# Load a package, installing it from CRAN first if it is not already available.
# x: package name as a character string. Stops with an error if the package
# still cannot be loaded after installation.
loadPkg <- function(x) {
  if (!require(x, character.only = TRUE, quietly = TRUE)) {
    install.packages(x, dep = TRUE, repos = "http://cran.us.r-project.org")
    if (!require(x, character.only = TRUE)) {
      stop("Package not found")
    }
  }
}
# Load all packages used in this analysis; loadPkg() installs any that are
# missing. invisible() suppresses the list that lapply() would print.
invisible(lapply(
  c("randomForest", "DMwR", "ggplot2", "Metrics", "FNN", "leaps",
    "ISLR", "tree", "rpart", "rpart.plot",
    "rattle",    # for fancyRpartPlot (trees); answer "no" on installing from binary source
    "corrplot"),
  loadPkg
))

Chapter 1. Introduction

Chapter 2. Preparation

Import data

Before importing the data, we preprocessed the raw data in Python to obtain a nicer data frame as the raw data contains some columns written in JSON format with many attributes.

# Read the movie data set (pre-processed in Python from the raw JSON-bearing
# CSV; see the note above this chunk in the document).
df <- read.csv("Movie.csv")
str(df) # glance at the data: column types and first few values per column
## 'data.frame':    4803 obs. of  12 variables:
##  $ X                   : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ budget              : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
##  $ genres              : Factor w/ 21 levels "","Action","Adventure",..: 2 3 2 2 2 10 4 2 3 2 ...
##  $ popularity          : num  150.4 139.1 107.4 112.3 43.9 ...
##  $ production_companies: Factor w/ 1314 levels "","100 Bares",..: 615 1263 265 696 1263 265 1263 758 1267 320 ...
##  $ release_date        : Factor w/ 3281 levels "","1916-09-04",..: 2315 1945 3185 2688 2635 1940 2450 3111 2246 3234 ...
##  $ revenue             : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
##  $ runtime             : num  162 169 148 165 132 139 100 141 153 151 ...
##  $ title               : Factor w/ 4800 levels "(500) Days of Summer",..: 381 2653 3186 3614 1906 3198 3364 382 1587 444 ...
##  $ vote_average        : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
##  $ vote_count          : int  11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
##  $ Number_Genres       : int  4 3 3 4 3 3 2 3 3 3 ...

Data Cleaning

# Remove rows with missing values.
# We could impute missing values with predictive models, but there are few of
# them and no obvious model choice, so we simply drop those rows.
movie <- na.omit(df)

# Remove duplicate rows.
movie <- unique(movie)

# Remove movies with no budget, no revenue, no genre, no vote, no score:
# zeros are placeholders for "unknown" in this data set.
movie <- subset(movie, !(budget == 0 | revenue == 0 | Number_Genres == 0 |
                           vote_count == 0 | vote_average == 0))

# str(movie)
# Rename some columns for convenience.
colnames(movie)[c(5, 6, 10, 11, 12)] <- c("company", "date", "score", "vote", "genrecount")

movie$profit <- movie$revenue - movie$budget # profit = revenue - budget

# Binary label: 1 for a positive profit, 0 otherwise. Vectorized comparison
# replaces the original row-by-row loop over 1:nrow(movie).
movie$profitable <- as.factor(as.integer(movie$profit > 0))
# ---- Preprocess the company column ----

movie$company <- as.character(movie$company)  # character is easier to recode than factor

# Studios belonging to the 5 largest parent studios in the world, collected
# from Wikipedia and other websites. The lists may miss some minor studios,
# but they include the majority of popular ones.

warner <- c("Warner Bros.", "Warner Bros. Pictures", "Warner Bros. Animation", "Warner Bros. Family Entertainment", "Warner Brothers/Seven Arts",  "New Line Cinema", "DC Comics","Castle Rock Entertainment")

universal <- c("Universal","Universal Pictures", "Universal Pictures Corporation", "Universal Studios", "DreamWorks", "DreamWorks Animation", "DreamWorks SKG", "Amblin Entertainment", "Working Title Films", "Focus Features", "Focus Films","Gramercy Pictures")

paramount <- c("Paramount Pictures", "Paramount", "Paramount Classics", "Paramount Vantage","MTV Films","Nickelodeon Movies")

walt_disney <- c("Walt Disney", "Walt Disney Animation Studios", "Walt Disney Pictures", "Walt Disney Productions", "Walt Disney Studios Motion Pictures", "Walt Disney Television Animation", "Buena Vista", "Touchstone Pictures", "Hollywood Pictures", "Caravan Pictures","Miramax","Miramax Films","Dimension Films","Marvel Studios", "Twentieth Century Fox Film Corporation", "Blue Sky Studios","Lucasfilm","Fox 2000 Pictures", "Fox Atomic","Fox Entertainment Group","Fox Searchlight Pictures","UTV Motion Pictures")

# BUG FIX: the original had "Columnbia Pictures Industries" (typo), which
# could never match the data, so those movies were silently labeled "Others".
sony <- c("Columbia Pictures","Columbia Pictures Corporation", "Columbia Pictures Industries", "Sony Pictures", "Sony Pictures Animation", "Sony Pictures Classics", "TriStar Pictures","Destination Films")

# Other film companies are collapsed into the "Others" group.
# Recode the company column using the lists of the 5 largest movie companies
# above. Vectorized %in% lookups replace the original row-by-row loop; doing
# "Others" first and then the five groups preserves the original priority,
# because once a name is replaced by a group label it no longer matches any
# of the studio lists below.
big5 <- c(warner, universal, paramount, walt_disney, sony)
movie$company[!(movie$company %in% big5)] <- "Others"
movie$company[movie$company %in% warner] <- "Warner Bros"
movie$company[movie$company %in% universal] <- "Universal Pictures"
movie$company[movie$company %in% paramount] <- "Paramount Pictures"
movie$company[movie$company %in% walt_disney] <- "Walt Disney"
movie$company[movie$company %in% sony] <- "Sony Pictures"

movie$company <- as.factor(movie$company) # back to factor type for modeling
genre_count <- data.frame(table(movie$genres)) # number of movies in each genre
# genre_count # inspect the counts

# There is a single record with the Foreign genre; remove it so that when we
# split train and test sets we do not have to deal with a factor level that
# can appear in only one of them. One record of an unpopular genre will not
# affect the models or analysis.
movie <- subset(movie, genres != "Foreign")

row.names(movie) <- NULL # renumber rows, since removals leave gaps in the names
# droplevels() is the idiomatic way to discard factor levels that no longer
# occur (the original round-tripped through as.character + factor).
movie$genres <- droplevels(movie$genres) # drop the now-unused "Foreign" level
movie$title <- droplevels(movie$title)   # drop title levels of removed movies


# conver date column to date formate as Y-m-d
loadPkg("lubridate")
movie$date <- as.character(movie$date)  # change the date column to character
movie$date <- as.Date(movie$date, format = "%Y-%m-%d") # save it as date format


str(movie)  # check the structure now
## 'data.frame':    3225 obs. of  14 variables:
##  $ X         : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ budget    : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
##  $ genres    : Factor w/ 18 levels "Action","Adventure",..: 1 2 1 1 1 9 3 1 2 1 ...
##  $ popularity: num  150.4 139.1 107.4 112.3 43.9 ...
##  $ company   : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 5 3 1 5 3 5 5 6 6 ...
##  $ date      : Date, format: "2009-12-10" "2007-05-19" ...
##  $ revenue   : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
##  $ runtime   : num  162 169 148 165 132 139 100 141 153 151 ...
##  $ title     : Factor w/ 3224 levels "(500) Days of Summer",..: 259 1761 2129 2420 1265 2139 2256 260 1053 310 ...
##  $ score     : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
##  $ vote      : int  11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
##  $ genrecount: int  4 3 3 4 3 3 2 3 3 3 ...
##  $ profit    : num  2.55e+09 6.61e+08 6.36e+08 8.35e+08 2.41e+07 ...
##  $ profitable: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Create a season column; initially it holds the release month (1-12).
movie$season <- month(movie$date)
movies <- movie
colnames(movies)[c(15)] <- c("month") # keep a copy with the raw month column

# Quarter of the year (used later for the time series model), computed
# directly from the month with integer division instead of a row-by-row loop:
# months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4.
movie$quarter <- paste0("Q", (movie$season - 1) %/% 3 + 1)

# Meteorological seasons via a month -> season lookup table (vectorized
# replacement for the original loop): Dec-Feb winter, Mar-May spring,
# Jun-Aug summer, Sep-Nov fall. All dates parsed cleanly above (no NA months
# after na.omit), so no NA handling is needed here.
season_lookup <- c("Winter", "Winter", "Spring", "Spring", "Spring", "Summer",
                   "Summer", "Summer", "Fall", "Fall", "Fall", "Winter")
movie$season <- season_lookup[movie$season]

movie$season <- factor(movie$season, levels = c("Spring", "Summer", "Fall", "Winter")) # ordered factor levels

movie$quarter <- as.factor(movie$quarter) # quarter column as factor

movie$year <- year(movie$date) # release year, used later for time series
# We won't use the genrecount column or the first column X, which is a
# leftover default index from the Python cleaning step.
movie <- movie[-c(1, 12)] # drop X and genrecount


str(movie) # check the structure of our data after cleaning
## 'data.frame':    3225 obs. of  15 variables:
##  $ budget    : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
##  $ genres    : Factor w/ 18 levels "Action","Adventure",..: 1 2 1 1 1 9 3 1 2 1 ...
##  $ popularity: num  150.4 139.1 107.4 112.3 43.9 ...
##  $ company   : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 5 3 1 5 3 5 5 6 6 ...
##  $ date      : Date, format: "2009-12-10" "2007-05-19" ...
##  $ revenue   : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
##  $ runtime   : num  162 169 148 165 132 139 100 141 153 151 ...
##  $ title     : Factor w/ 3224 levels "(500) Days of Summer",..: 259 1761 2129 2420 1265 2139 2256 260 1053 310 ...
##  $ score     : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
##  $ vote      : int  11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
##  $ profit    : num  2.55e+09 6.61e+08 6.36e+08 8.35e+08 2.41e+07 ...
##  $ profitable: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ season    : Factor w/ 4 levels "Spring","Summer",..: 4 1 3 2 1 1 3 1 2 1 ...
##  $ quarter   : Factor w/ 4 levels "Q1","Q2","Q3",..: 4 2 4 3 1 2 4 2 3 1 ...
##  $ year      : num  2009 2007 2015 2012 2012 ...

Data Summary

summary(movie) # whole-data summary: quartiles for numeric columns, level counts for factors
##      budget               genres      popularity 
##  Min.   :1.00e+00   Drama    :745   Min.   :  0  
##  1st Qu.:1.05e+07   Comedy   :634   1st Qu.: 10  
##  Median :2.50e+07   Action   :588   Median : 20  
##  Mean   :4.07e+07   Adventure:288   Mean   : 29  
##  3rd Qu.:5.50e+07   Horror   :197   3rd Qu.: 37  
##  Max.   :3.80e+08   Crime    :141   Max.   :876  
##                     (Other)  :632                
##                company          date               revenue        
##  Others            :1636   Min.   :1916-09-04   Min.   :5.00e+00  
##  Paramount Pictures: 255   1st Qu.:1998-09-10   1st Qu.:1.71e+07  
##  Sony Pictures     : 277   Median :2005-07-20   Median :5.52e+07  
##  Universal Pictures: 338   Mean   :2002-03-18   Mean   :1.21e+08  
##  Walt Disney       : 497   3rd Qu.:2010-11-11   3rd Qu.:1.46e+08  
##  Warner Bros       : 222   Max.   :2016-09-09   Max.   :2.79e+09  
##                                                                   
##     runtime                           title          score     
##  Min.   : 41   The Host                  :   2   Min.   :2.30  
##  1st Qu.: 96   (500) Days of Summer      :   1   1st Qu.:5.80  
##  Median :107   [REC]                     :   1   Median :6.30  
##  Mean   :111   [REC]²                    :   1   Mean   :6.31  
##  3rd Qu.:121   10 Cloverfield Lane       :   1   3rd Qu.:6.90  
##  Max.   :338   10 Things I Hate About You:   1   Max.   :8.50  
##                (Other)                   :3218                 
##       vote           profit          profitable    season    quarter 
##  Min.   :    1   Min.   :-1.66e+08   0: 787     Spring:704   Q1:656  
##  1st Qu.:  179   1st Qu.: 2.52e+05   1:2438     Summer:837   Q2:757  
##  Median :  471   Median : 2.64e+07              Fall  :930   Q3:931  
##  Mean   :  978   Mean   : 8.07e+07              Winter:754   Q4:881  
##  3rd Qu.: 1148   3rd Qu.: 9.75e+07                                   
##  Max.   :13752   Max.   : 2.55e+09                                   
##                                                                      
##       year     
##  Min.   :1916  
##  1st Qu.:1998  
##  Median :2005  
##  Mean   :2002  
##  3rd Qu.:2010  
##  Max.   :2016  
## 

We need to scale the data because the differences in magnitude between variables are too large.

# Summary of the numerical variables, selected by name rather than by
# position so the code survives any column reordering.
summary(movie[c("revenue", "budget", "popularity", "runtime", "score", "vote", "profit")])
##     revenue             budget           popularity     runtime   
##  Min.   :5.00e+00   Min.   :1.00e+00   Min.   :  0   Min.   : 41  
##  1st Qu.:1.71e+07   1st Qu.:1.05e+07   1st Qu.: 10   1st Qu.: 96  
##  Median :5.52e+07   Median :2.50e+07   Median : 20   Median :107  
##  Mean   :1.21e+08   Mean   :4.07e+07   Mean   : 29   Mean   :111  
##  3rd Qu.:1.46e+08   3rd Qu.:5.50e+07   3rd Qu.: 37   3rd Qu.:121  
##  Max.   :2.79e+09   Max.   :3.80e+08   Max.   :876   Max.   :338  
##      score           vote           profit         
##  Min.   :2.30   Min.   :    1   Min.   :-1.66e+08  
##  1st Qu.:5.80   1st Qu.:  179   1st Qu.: 2.52e+05  
##  Median :6.30   Median :  471   Median : 2.64e+07  
##  Mean   :6.31   Mean   :  978   Mean   : 8.07e+07  
##  3rd Qu.:6.90   3rd Qu.: 1148   3rd Qu.: 9.75e+07  
##  Max.   :8.50   Max.   :13752   Max.   : 2.55e+09

Variance and SD

# Standard deviations and variances of the numerical variables, selected by
# name rather than by position so the code survives any column reordering.
sapply(movie[c("revenue", "budget", "popularity", "runtime", "score", "vote", "profit")], sd) # check the sd
##    revenue     budget popularity    runtime      score       vote 
##   1.86e+08   4.44e+07   3.62e+01   2.10e+01   8.60e-01   1.41e+03 
##     profit 
##   1.58e+08
sapply(movie[c("revenue", "budget", "popularity", "runtime", "score", "vote", "profit")], var) # check the var
##    revenue     budget popularity    runtime      score       vote 
##   3.47e+16   1.97e+15   1.31e+03   4.40e+02   7.39e-01   2.00e+06 
##     profit 
##   2.50e+16

The means, variance and sd between variables are quite high as most of them have different scales. We need to scale the data for some models like linear regression, PCR, KNN, etc.

Data visualization

Number of movies by company

# Count the number of movies per company and draw a horizontal bar chart,
# ordered by count.
comp_count <- data.frame(table(movie$company))

ggplot(comp_count, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "gold", alpha = 0.9) +
  coord_flip() +
  labs(x = "Companies", y = "Number of movies",
       title = "Number of movies by companies") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

Number of movies by season

# Number of movies released in each season.
season_count <- data.frame(table(movie$season))

ggplot(season_count, aes(x = Var1, y = Freq, fill = Var1)) +
  geom_bar(stat = "identity", alpha = 0.9) +
  labs(x = "Season", y = "Number of movies",
       title = "Number of movies by season") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = "Season"))

# Number of movies per genre, as a horizontal bar chart ordered by count.
genres_count <- data.frame(table(movie$genres))

ggplot(genres_count, aes(x = reorder(Var1, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "dodgerblue", alpha = 0.9) +
  coord_flip() +
  labs(x = "Genre", y = "Number of movies",
       title = "Number of movies by genre") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

Chapter 3. Revenue Prediction

In this section, we will try three models: Linear Regression, Decision Tree (Regression Tree since the response is continuous variable) and Random Forest. In each model, we will choose the best formula (adj R2, BIC, CP for Linear Regression, pruning for Regression Tree, tuning for Random Forest). And then we compare the three best models.

Dependency

Numerical Variables

loadPkg("corrplot")

# Numerical variables excluding profit, to be the data for the later models.
# Columns are selected by name rather than by position for robustness.
movie_num <- movie[c("revenue", "budget", "popularity", "runtime", "score", "vote")]

movie_num_all <- data.frame(cbind(movie_num, profit = movie$profit)) # same variables plus profit

# cordf_all <- cor(movie_num_all)
# corrplot(cordf_all)


cordf <- cor(movie_num) # correlation matrix

corrplot(cordf) # visualize the pairwise correlations

  • We can check the effect of categorical variables by anova test.

Genre

Test frequency distributions of revenue in different genres

# One-way ANOVA; null hypothesis: mean revenue is the same across genres.
anov_genre <- aov(revenue ~ genres, data = movie)
summary(anov_genre)
##               Df  Sum Sq  Mean Sq F value Pr(>F)    
## genres        17 1.4e+19 8.21e+17    26.9 <2e-16 ***
## Residuals   3207 9.8e+19 3.06e+16                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# adhoc_genre <- TukeyHSD(anov_genre) # post-hoc: which genre pairs differ in mean revenue
# adhoc_genre

Overall, there is evidence that the frequency distributions of revenue are not the same across genres. It seems that revenue depends on genre.

Company

Check frequency distributions of revenue in different companies

# One-way ANOVA; null hypothesis: mean revenue is the same across companies.
anov_comp <- aov(revenue ~ company, data = movie)
summary(anov_comp)
##               Df   Sum Sq  Mean Sq F value Pr(>F)    
## company        5 4.80e+18 9.59e+17    28.8 <2e-16 ***
## Residuals   3219 1.07e+20 3.33e+16                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# adhoc_comp <- TukeyHSD(anov_comp) # post-hoc: which company pairs differ in mean revenue
# adhoc_comp

Overall, there is evidence that the frequency distributions of revenue are not the same across companies. It seems that revenue depends on company.

Season

# One-way ANOVA; null hypothesis: mean revenue is the same across seasons.
anov_ss <- aov(revenue ~ season, data = movie)
summary(anov_ss)
##               Df   Sum Sq  Mean Sq F value  Pr(>F)    
## season         3 1.59e+18 5.31e+17    15.5 5.3e-10 ***
## Residuals   3221 1.10e+20 3.43e+16                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adhoc_ss <- TukeyHSD(anov_ss) # post-hoc: which season pairs differ significantly
adhoc_ss
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = revenue ~ season, data = movie)
## 
## $season
##                    diff       lwr       upr p adj
## Summer-Spring   2809117 -21525012  27143245 0.991
## Fall-Spring   -46272179 -70043962 -22500396 0.000
## Winter-Spring -37859214 -62797712 -12920716 0.001
## Fall-Summer   -49081296 -71752657 -26409934 0.000
## Winter-Summer -40668330 -64560204 -16776456 0.000
## Winter-Fall     8412966 -14905901  31731832 0.790

It seems that winter - fall are in the same group and spring - summer are in the same group.

Training and testing sets

# Scale the data, since the variances and standard deviations differ greatly
# between variables (see the previous section).

# Every numerical predictor is scaled; the response "revenue" is left on its
# original scale so predictions and error metrics stay interpretable.
scale_movie <- as.data.frame(scale(movie_num[c(2:6)], center = TRUE, scale = TRUE)) 


str(movie_num_all)
## 'data.frame':    3225 obs. of  7 variables:
##  $ revenue   : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
##  $ budget    : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
##  $ popularity: num  150.4 139.1 107.4 112.3 43.9 ...
##  $ runtime   : num  162 169 148 165 132 139 100 141 153 151 ...
##  $ score     : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
##  $ vote      : int  11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
##  $ profit    : num  2.55e+09 6.61e+08 6.36e+08 8.35e+08 2.41e+07 ...
str(scale_movie)
## 'data.frame':    3225 obs. of  5 variables:
##  $ budget    : num  4.42 5.84 4.6 4.71 4.94 ...
##  $ popularity: num  3.355 3.041 2.165 2.301 0.411 ...
##  $ runtime   : num  2.44 2.78 1.78 2.59 1.01 ...
##  $ score     : num  1.031 0.6821 -0.0157 1.4963 -0.2483 ...
##  $ vote      : num  7.65 2.49 2.47 5.74 0.81 ...
set.seed(1000) # fix the RNG so the train/test split below is reproducible

movie_sample <- sample(2, nrow(scale_movie), replace=TRUE, prob=c(0.67, 0.33)) # label each row 1 (train) or 2 (test), ratio ~67:33
xtrain1 <- scale_movie[movie_sample==1, 1:5] # train set containing the scaled predictors
xtest1 <- scale_movie[movie_sample==2, 1:5] # test set containing the scaled predictors



ytrain1 <- movie_num[movie_sample==1, 1] # train-set response "revenue" (unscaled)
ytest1 <- movie_num[movie_sample==2, 1] # test-set response "revenue" (unscaled)

# combine response and predictors into overall train/test data frames
train1 <- as.data.frame(cbind(revenue = ytrain1,xtrain1)) 
test1 <- as.data.frame(cbind(revenue = ytest1,xtest1))

str(train1)
## 'data.frame':    2176 obs. of  6 variables:
##  $ revenue   : num  2.79e+09 8.81e+08 2.84e+08 8.91e+08 1.41e+09 ...
##  $ budget    : num  4.42 4.6 4.94 4.89 5.39 ...
##  $ popularity: num  3.355 2.165 0.411 2.395 2.908 ...
##  $ runtime   : num  2.44 1.78 1.01 1.35 1.44 ...
##  $ score     : num  1.031 -0.0157 -0.2483 -0.4809 1.1474 ...
##  $ vote      : num  7.65 2.47 0.81 1.84 4.09 ...
str(test1)
## 'data.frame':    1049 obs. of  6 variables:
##  $ revenue   : num  9.61e+08 1.08e+09 5.92e+08 5.86e+08 8.93e+07 ...
##  $ budget    : num  5.84 4.71 4.94 3.59 4.83 ...
##  $ popularity: num  3.041 2.301 0.542 2.18 0.552 ...
##  $ runtime   : num  2.779 2.588 -0.511 -0.225 1.825 ...
##  $ score     : num  0.682 1.496 1.264 -0.248 -0.481 ...
##  $ vote      : num  2.489 5.745 1.662 1.404 0.942 ...

Linear model

Numerical Variables

Model Construction

Construct the model on the training set (using all numerical variables)

loadPkg("faraway") # for vif()

# Fit a linear model of revenue on all (scaled) numerical predictors, on the
# training set.
rev_lm1 <- lm(revenue ~., data = train1) # construct the model
summary(rev_lm1) # coefficients, significance and R^2 of the model
## 
## Call:
## lm(formula = revenue ~ ., data = train1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -6.21e+08 -3.89e+07 -1.92e+06  2.46e+07  1.60e+09 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 122161921    2168480   56.34  < 2e-16 ***
## budget       82502895    2822244   29.23  < 2e-16 ***
## popularity   14588304    2952684    4.94  8.4e-07 ***
## runtime      -1265467    2415512   -0.52     0.60    
## score          212212    2648862    0.08     0.94    
## vote         85807055    3723449   23.05  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.01e+08 on 2170 degrees of freedom
## Multiple R-squared:  0.711,  Adjusted R-squared:  0.711 
## F-statistic: 1.07e+03 on 5 and 2170 DF,  p-value: <2e-16
vif(rev_lm1) # variance inflation factors to check multicollinearity among predictors
##     budget popularity    runtime      score       vote 
##       1.68       2.15       1.26       1.50       2.95

Prediction

Testing

# Evaluate the full model on the held-out test set.
rev_lm_test1 <- predict(object = rev_lm1, test1) # predict on the test set
rev_lm_pred1 <- data.frame(cbind(actuals=test1$revenue, predicteds=rev_lm_test1)) # pair predicted with actual values
rev_lm_metrics1 <- regr.eval(rev_lm_pred1$actuals, rev_lm_pred1$predicteds) # regr.eval() from the DMwR package
                                                                            # returns 4 metrics (RMSE, MSE, MAE, MAPE)
rev_lm_metrics1[c(1,3)] # show only MAE and RMSE 
##      mae     rmse 
## 6.25e+07 1.06e+08
# MAPE has known drawbacks (e.g. it is undefined/explosive for actuals near
# zero), so models are compared with RMSE and MAE only.

Training

# Fitted (in-sample) values on the training set.
#rev_lm_train1 <- rev_lm1$fitted.values
rev_lm_train1 <- predict(object = rev_lm1)  # same result as rev_lm1$fitted.values:
                                            # with no newdata argument, predict() returns the
                                            # fitted values, identical to newdata = train1 for lm.
                                            # For ensemble models (e.g. random forest) predict()
                                            # without newdata behaves differently (out-of-bag),
                                            # so the predict() form is kept for consistency.

rev_lm_predtrain1 <- data.frame(cbind(actuals=train1$revenue, predicteds=rev_lm_train1)) # pair fitted with actual values
rev_lm_metrics_train1 <- regr.eval(rev_lm_predtrain1$actuals, rev_lm_predtrain1$predicteds) 
rev_lm_metrics_train1[c(1,3)] # note: the RMSE of the fitted values matches the residual standard error
##      mae     rmse 
## 5.86e+07 1.01e+08

Feature selection

# Exhaustive best-subset search over the predictors (leaps package),
# keeping the single best model of each size.
reg.lm <- regsubsets(revenue ~ ., data = train1, nbest = 1, method = "exhaustive")
plot(reg.lm, scale = "adjr2", main = "Adjusted R^2") # best subsets ranked by adjusted R^2

plot(reg.lm, scale = "bic", main = "BIC")            # best subsets ranked by BIC

plot(reg.lm, scale = "Cp", main = "Cp")              # best subsets ranked by Mallows' Cp

# summary(reg.lm)

All three selection criteria (adjusted R², BIC and Cp) show that the predictors (budget + popularity + vote) form the best model.

Best Model

Construct the model on train set

# Reduced linear model using only the three predictors chosen by
# best-subset selection: budget, popularity and vote.
rev_lm2 <- lm(revenue ~ budget + popularity + vote, data = train1) # models with budget + popularity + vote
summary(rev_lm2)
## 
## Call:
## lm(formula = revenue ~ budget + popularity + vote, data = train1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -6.21e+08 -3.85e+07 -2.19e+06  2.44e+07  1.60e+09 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.22e+08   2.17e+06   56.36  < 2e-16 ***
## budget      8.23e+07   2.62e+06   31.45  < 2e-16 ***
## popularity  1.46e+07   2.95e+06    4.96  7.8e-07 ***
## vote        8.57e+07   3.47e+06   24.72  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.01e+08 on 2172 degrees of freedom
## Multiple R-squared:  0.711,  Adjusted R-squared:  0.711 
## F-statistic: 1.78e+03 on 3 and 2172 DF,  p-value: <2e-16
# Variance inflation factors — all well below the usual cutoff of 5/10,
# so multicollinearity is not a concern among these predictors.
# NOTE(review): vif() is not provided by any package loaded at the top of this
# report (it usually comes from `car` or `faraway`) — confirm it is loaded elsewhere.
vif(rev_lm2)
##     budget popularity       vote 
##       1.45       2.15       2.55

Prediction

# Evaluate the reduced model rev_lm2 on both the test and the train set.
rev_lm_test2 <- predict(object = rev_lm2, test1) # prediction on test set
# BUG FIX: the original paired the actuals with rev_lm_test1 (predictions from
# the FULL model), so the reported "model 2" test metrics actually belonged to
# rev_lm1. Use this model's own predictions.
rev_lm_pred2 <- data.frame(cbind(actuals=test1$revenue, predicteds=rev_lm_test2)) 
rev_lm_metrics2 <- regr.eval(rev_lm_pred2$actuals, rev_lm_pred2$predicteds) # metrics (mae, mse, rmse, mape)
rev_lm_metrics2[c(1,3)] # show MAE and RMSE 
##      mae     rmse 
## 6.25e+07 1.06e+08
rev_lm_train2 <- predict(object = rev_lm2) # fitted values on train set (no newdata)
# BUG FIX: likewise, rev_lm_train1 was used here in place of rev_lm_train2.
rev_lm_predtrain2 <- data.frame(cbind(actuals=train1$revenue, predicteds=rev_lm_train2))
rev_lm_metrics_train2 <- regr.eval(rev_lm_predtrain2$actuals, rev_lm_predtrain2$predicteds) # metrics
rev_lm_metrics_train2[c(1,3)] # show MAE and RMSE 
##      mae     rmse 
## 5.86e+07 1.01e+08

No change when comparing to the model containing all numerical variables. We can remove unnecessary variables without reducing Adjusted R-squared or increasing RMSE. The best model is less complex and can avoid overfitting due to many predictors (high dimensionality).

Categorical and Numerical Variables

# include the genres, company and season variables to the train and test sets

# Hoist the repeated subsetting: extract each partition of `movie` once
# instead of re-evaluating movie[movie_sample==k, 1:14] three times per set.
movie_train_src <- movie[movie_sample==1, 1:14] # rows of the train split
movie_test_src  <- movie[movie_sample==2, 1:14] # rows of the test split

train1_full <- train1 # duplicate the train set as we want to keep the two train sets separately
train1_full$genres  <- movie_train_src$genres  # append genres to the train set
train1_full$company <- movie_train_src$company # append the company to the train set
train1_full$season  <- movie_train_src$season  # append season


# we do the same with the test set
test1_full <- test1
test1_full$genres  <- movie_test_src$genres
test1_full$company <- movie_test_src$company
test1_full$season  <- movie_test_src$season

# Sanity-check the augmented data frames: each now has 9 variables — the
# original (scaled) numerical columns plus genres, company and season factors.
str(train1_full)
## 'data.frame':    2176 obs. of  9 variables:
##  $ revenue   : num  2.79e+09 8.81e+08 2.84e+08 8.91e+08 1.41e+09 ...
##  $ budget    : num  4.42 4.6 4.94 4.89 5.39 ...
##  $ popularity: num  3.355 2.165 0.411 2.395 2.908 ...
##  $ runtime   : num  2.44 1.78 1.01 1.35 1.44 ...
##  $ score     : num  1.031 -0.0157 -0.2483 -0.4809 1.1474 ...
##  $ vote      : num  7.65 2.47 0.81 1.84 4.09 ...
##  $ genres    : Factor w/ 18 levels "Action","Adventure",..: 1 1 1 9 1 2 1 2 2 2 ...
##  $ company   : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 3 5 3 5 6 6 6 5 5 ...
##  $ season    : Factor w/ 4 levels "Spring","Summer",..: 4 3 1 1 1 2 1 2 2 1 ...
str(test1_full)
## 'data.frame':    1049 obs. of  9 variables:
##  $ revenue   : num  9.61e+08 1.08e+09 5.92e+08 5.86e+08 8.93e+07 ...
##  $ budget    : num  5.84 4.71 4.94 3.59 4.83 ...
##  $ popularity: num  3.041 2.301 0.542 2.18 0.552 ...
##  $ runtime   : num  2.779 2.588 -0.511 -0.225 1.825 ...
##  $ score     : num  0.682 1.496 1.264 -0.248 -0.481 ...
##  $ vote      : num  2.489 5.745 1.662 1.404 0.942 ...
##  $ genres    : Factor w/ 18 levels "Action","Adventure",..: 2 1 3 2 1 1 1 7 16 2 ...
##  $ company   : Factor w/ 6 levels "Others","Paramount Pictures",..: 5 1 5 1 5 1 1 2 4 1 ...
##  $ season    : Factor w/ 4 levels "Spring","Summer",..: 1 2 3 3 2 2 1 3 1 1 ...

Model Construction

# Full linear model on all numerical AND categorical predictors of train1_full.
rev_lm4 <- lm(revenue ~., data = train1_full)
summary(rev_lm4)
## 
## Call:
## lm(formula = revenue ~ ., data = train1_full)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -6.08e+08 -4.03e+07 -1.43e+06  2.90e+07  1.62e+09 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               108110505    6817790   15.86  < 2e-16 ***
## budget                     77348867    3033574   25.50  < 2e-16 ***
## popularity                 13617667    2932282    4.64  3.6e-06 ***
## runtime                     5900196    2594312    2.27  0.02305 *  
## score                      -1602884    2751829   -0.58  0.56030    
## vote                       87028064    3716646   23.42  < 2e-16 ***
## genresAdventure            14998048    8669568    1.73  0.08378 .  
## genresAnimation            87790816   13236699    6.63  4.2e-11 ***
## genresComedy               25268186    7163941    3.53  0.00043 ***
## genresCrime               -10102172   11467478   -0.88  0.37845    
## genresDocumentary          44711638   23188936    1.93  0.05397 .  
## genresDrama                 4923647    7294556    0.67  0.49976    
## genresFamily               82467003   20391297    4.04  5.4e-05 ***
## genresFantasy               2870822   13660650    0.21  0.83357    
## genresHistory              15689738   24998838    0.63  0.53032    
## genresHorror               17619142   10531825    1.67  0.09448 .  
## genresMusic                23079584   29354497    0.79  0.43182    
## genresMystery               6103862   24024211    0.25  0.79946    
## genresRomance              17057626   14926772    1.14  0.25327    
## genresScience Fiction     -15440346   15106799   -1.02  0.30686    
## genresThriller             -7679779   12337527   -0.62  0.53370    
## genresWar                 -56563625   33719837   -1.68  0.09360 .  
## genresWestern              -3456414   24929422   -0.14  0.88974    
## companyParamount Pictures  19120256    8303147    2.30  0.02139 *  
## companySony Pictures        4555821    7970621    0.57  0.56767    
## companyUniversal Pictures  16743343    7455117    2.25  0.02481 *  
## companyWalt Disney         16706288    6373478    2.62  0.00882 ** 
## companyWarner Bros          -535044    8559594   -0.06  0.95016    
## seasonSummer                -822742    6192423   -0.13  0.89431    
## seasonFall                 -9929435    6117507   -1.62  0.10471    
## seasonWinter               -5891796    6282550   -0.94  0.34845    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99300000 on 2145 degrees of freedom
## Multiple R-squared:  0.725,  Adjusted R-squared:  0.721 
## F-statistic:  188 on 30 and 2145 DF,  p-value: <2e-16
vif(rev_lm4)
##                    budget                popularity 
##                      2.01                      2.20 
##                   runtime                     score 
##                      1.51                      1.68 
##                      vote           genresAdventure 
##                      3.04                      1.40 
##           genresAnimation              genresComedy 
##                      1.25                      1.79 
##               genresCrime         genresDocumentary 
##                      1.26                      1.08 
##               genresDrama              genresFamily 
##                      2.06                      1.08 
##             genresFantasy             genresHistory 
##                      1.14                      1.07 
##              genresHorror               genresMusic 
##                      1.32                      1.04 
##             genresMystery             genresRomance 
##                      1.04                      1.13 
##     genresScience Fiction            genresThriller 
##                      1.11                      1.18 
##                 genresWar             genresWestern 
##                      1.03                      1.06 
## companyParamount Pictures      companySony Pictures 
##                      1.09                      1.11 
## companyUniversal Pictures        companyWalt Disney 
##                      1.12                      1.18 
##        companyWarner Bros              seasonSummer 
##                      1.08                      1.61 
##                seasonFall              seasonWinter 
##                      1.65                      1.60

The p-values and t-values indicate that there is no significance among seasons. Season seems not to be a necessary predictor.

Feature Selection

# Best-subset selection again, now with the categorical predictors included.
reg.lm1 <- regsubsets(revenue~., data = train1_full, nbest = 1, method = "exhaustive")  # leaps, exhaustive method
plot(reg.lm1, scale = "adjr2", main = "Adjusted R^2")

plot(reg.lm1, scale = "bic", main = "BIC") 

plot(reg.lm1, scale = "Cp", main = "Cp")

When including the season, genre and company in the model, the best numerical predictors are still budget, popularity and vote. The effects of the different seasons do not appear significant. We will build the model with these 3 numerical predictors and 2 categorical variables: genre and company.

Best Model

# Final linear model: the three selected numerical predictors plus the
# company and genres factors (season dropped as non-significant).
rev_lm3 <- lm(revenue ~ budget + vote + company + genres + popularity , data = train1_full)
summary(rev_lm3)
## 
## Call:
## lm(formula = revenue ~ budget + vote + company + genres + popularity, 
##     data = train1_full)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -6.14e+08 -4.04e+07 -8.25e+05  2.96e+07  1.62e+09 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               103783104    5493099   18.89  < 2e-16 ***
## budget                     79307464    2810718   28.22  < 2e-16 ***
## vote                       87343750    3432074   25.45  < 2e-16 ***
## companyParamount Pictures  20187352    8296723    2.43  0.01505 *  
## companySony Pictures        4706890    7968099    0.59  0.55477    
## companyUniversal Pictures  17875529    7436574    2.40  0.01631 *  
## companyWalt Disney         16364447    6369330    2.57  0.01026 *  
## companyWarner Bros          -135185    8558273   -0.02  0.98740    
## genresAdventure            14924242    8645045    1.73  0.08443 .  
## genresAnimation            80203108   12811282    6.26  4.6e-10 ***
## genresComedy               24197175    7152449    3.38  0.00073 ***
## genresCrime                -8821228   11302914   -0.78  0.43522    
## genresDocumentary          40500854   22990580    1.76  0.07827 .  
## genresDrama                 6560315    6935298    0.95  0.34429    
## genresFamily               78112299   20296238    3.85  0.00012 ***
## genresFantasy               1516100   13618042    0.11  0.91136    
## genresHistory              22335236   24701813    0.90  0.36599    
## genresHorror               15897705   10491528    1.52  0.12985    
## genresMusic                22199706   29264598    0.76  0.44818    
## genresMystery               3544411   24017226    0.15  0.88269    
## genresRomance              16669177   14880624    1.12  0.26276    
## genresScience Fiction     -15795028   15101070   -1.05  0.29570    
## genresThriller             -7928898   12337416   -0.64  0.52051    
## genresWar                 -55041648   33646312   -1.64  0.10201    
## genresWestern                726641   24731085    0.03  0.97656    
## popularity                 13447493    2930278    4.59  4.7e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 99400000 on 2150 degrees of freedom
## Multiple R-squared:  0.724,  Adjusted R-squared:  0.721 
## F-statistic:  225 on 25 and 2150 DF,  p-value: <2e-16
vif(rev_lm3)
##                    budget                      vote 
##                      1.73                      2.59 
## companyParamount Pictures      companySony Pictures 
##                      1.09                      1.11 
## companyUniversal Pictures        companyWalt Disney 
##                      1.11                      1.17 
##        companyWarner Bros           genresAdventure 
##                      1.08                      1.39 
##           genresAnimation              genresComedy 
##                      1.17                      1.78 
##               genresCrime         genresDocumentary 
##                      1.22                      1.06 
##               genresDrama              genresFamily 
##                      1.86                      1.07 
##             genresFantasy             genresHistory 
##                      1.13                      1.04 
##              genresHorror               genresMusic 
##                      1.30                      1.03 
##             genresMystery             genresRomance 
##                      1.04                      1.12 
##     genresScience Fiction            genresThriller 
##                      1.11                      1.17 
##                 genresWar             genresWestern 
##                      1.03                      1.04 
##                popularity 
##                      2.19

The adjusted R-squared increases by 1.0% compared to the best model with numerical variables only.

Prediction

Testing

#loadPkg("MLmetrics")

# Score the final (numerical + categorical) model on the held-out test set.
rev_lm_test3 <- predict(rev_lm3, newdata = test1_full)
rev_lm_predtest3 <- data.frame(actuals = test1_full$revenue, predicteds = rev_lm_test3)
rev_lm_metrics_test3 <- regr.eval(rev_lm_predtest3$actuals, rev_lm_predtest3$predicteds) # mae, mse, rmse, mape
rev_lm_metrics_test3[c(1, 3)] # report MAE and RMSE only
##      mae     rmse 
## 6.19e+07 1.05e+08

Training

# In-sample evaluation of the final model.
rev_lm_train3 <- predict(object = rev_lm3) # predicted (fitted) values in train set
rev_lm_predtrain3 <- data.frame(cbind(actuals=train1_full$revenue, predicteds=rev_lm_train3))
rev_lm_metrics_train3 <- regr.eval(rev_lm_predtrain3$actuals, rev_lm_predtrain3$predicteds) # metrics (RMSE, MAE, MSE, MAPE)
options(scientific=T, digits = 2) # to make the format of metrics the same as above metrics (as Xe+07)
                                  # NOTE(review): "scientific" is not a recognized base-R option
                                  # (see ?options — the scientific-notation knob is `scipen`),
                                  # so only `digits` takes effect here.
rev_lm_metrics_train3[c(1,3)]
##     mae    rmse 
## 5.8e+07 9.9e+07
options(scientific=T, digits = 3) # change back to the format used elsewhere in this report
                                  # (note: R's startup default is digits = 7, not 3)

A slight improvement in this model: the adjusted R-squared increases by 1% and the RMSE on both the training and testing sets slightly decreases.

# Compare the two candidate linear models with information criteria.
AIC(rev_lm2) # reduced model: budget + popularity + vote
## [1] 86395
BIC(rev_lm2)
## [1] 86424
cat("\n")
AIC(rev_lm3) # final model, additionally including genres and company
## [1] 86343
BIC(rev_lm3)
## [1] 86497

Model 1: budget + vote + popularity Model 2: budget + vote + popularity + company + genres

Model 1 has higher AIC than Model 2, which indicates that Model 2 is better for predicting the revenue. Model 1 has lower BIC than Model 2, which indicates that Model 1 is better as a true function to explain the revenue. (BIC prefers simple models)

Regression Tree

With Decision Tree we can address both numerical and categorical variables in the model. We perform the prediction of revenue (a continuous response) so we use regression tree.

Tree Construction

Model Construction

We can try two functions to build a regression tree model

# Regression tree via the `tree` package (first of two implementations we compare).
rev_tree0 <- tree(revenue ~ ., data=train1_full) # use tree function
summary(rev_tree0)
## 
## Regression tree:
## tree(formula = revenue ~ ., data = train1_full)
## Variables actually used in tree construction:
## [1] "vote"   "budget" "genres"
## Number of terminal nodes:  10 
## Residual mean deviance:  9.85e+15 = 2.13e+19 / 2170 
## Distribution of residuals:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -6.75e+08 -3.76e+07 -1.25e+07  0.00e+00  2.52e+07  1.24e+09
plot(rev_tree0) 
text(rev_tree0,cex=0.7) # overlay split labels; cex shrinks the text to fit

# Same tree grown with rpart; method = "anova" requests a regression tree.
rev_tree <- rpart(revenue ~ ., method ="anova", data=train1_full) #use rpart with method = "anova"
#plotcp(rev_tree ) # visualize cross-validation results 
summary(rev_tree ) # detailed summary of splits
## Call:
## rpart(formula = revenue ~ ., data = train1_full, method = "anova")
##   n= 2176 
## 
##       CP nsplit rel error xerror   xstd
## 1 0.3861      0     1.000  1.001 0.1190
## 2 0.1250      1     0.614  0.709 0.0853
## 3 0.0704      2     0.489  0.535 0.0660
## 4 0.0528      3     0.418  0.483 0.0641
## 5 0.0265      4     0.366  0.455 0.0642
## 6 0.0256      5     0.339  0.452 0.0643
## 7 0.0139      6     0.313  0.412 0.0525
## 8 0.0107      7     0.300  0.383 0.0515
## 9 0.0100      8     0.289  0.385 0.0516
## 
## Variable importance
##       vote popularity     budget      score     genres     season 
##         42         22         20          7          6          1 
##    runtime    company 
##          1          1 
## 
## Node number 1: 2176 observations,    complexity param=0.386
##   mean=1.2e+08, MSE=3.53e+16 
##   left son=2 (1947 obs) right son=3 (229 obs)
##   Primary splits:
##       vote       < 0.949    to the left,  improve=0.3860, (0 missing)
##       budget     < 1.17     to the left,  improve=0.3840, (0 missing)
##       popularity < 1.27     to the left,  improve=0.3370, (0 missing)
##       genres     splits as  RRRLLLLRRLLLLLRLLL, improve=0.0995, (0 missing)
##       runtime    < 0.609    to the left,  improve=0.0587, (0 missing)
##   Surrogate splits:
##       popularity < 0.833    to the left,  agree=0.948, adj=0.502, (0 split)
##       budget     < 2.45     to the left,  agree=0.911, adj=0.153, (0 split)
##       score      < 1.79     to the left,  agree=0.902, adj=0.066, (0 split)
## 
## Node number 2: 1947 observations,    complexity param=0.0704
##   mean=8e+07, MSE=9.5e+15 
##   left son=4 (1481 obs) right son=5 (466 obs)
##   Primary splits:
##       vote       < -0.099   to the left,  improve=0.2930, (0 missing)
##       popularity < -0.00948 to the left,  improve=0.2540, (0 missing)
##       budget     < 0.716    to the left,  improve=0.2480, (0 missing)
##       company    splits as  LRRRRR, improve=0.0571, (0 missing)
##       genres     splits as  LRRLLLLRRLLLLLLLLL, improve=0.0563, (0 missing)
##   Surrogate splits:
##       popularity < 0.0996   to the left,  agree=0.922, adj=0.674, (0 split)
##       budget     < 1.09     to the left,  agree=0.791, adj=0.129, (0 split)
##       score      < 2.02     to the left,  agree=0.762, adj=0.004, (0 split)
##       runtime    < 4.4      to the left,  agree=0.761, adj=0.002, (0 split)
## 
## Node number 3: 229 observations,    complexity param=0.125
##   mean=4.61e+08, MSE=1.25e+17 
##   left son=6 (101 obs) right son=7 (128 obs)
##   Primary splits:
##       budget     < 0.739    to the left,  improve=0.335, (0 missing)
##       vote       < 2.5      to the left,  improve=0.269, (0 missing)
##       genres     splits as  RRRLL-LRRLL-LLRLLR, improve=0.210, (0 missing)
##       popularity < 1.8      to the left,  improve=0.176, (0 missing)
##       runtime    < 1.18     to the left,  improve=0.093, (0 missing)
##   Surrogate splits:
##       genres     splits as  RRRLL-LRRLL-LLRLLR, agree=0.790, adj=0.525, (0 split)
##       score      < 1.44     to the right, agree=0.716, adj=0.356, (0 split)
##       vote       < 1.97     to the left,  agree=0.633, adj=0.168, (0 split)
##       popularity < 1.15     to the left,  agree=0.607, adj=0.109, (0 split)
##       season     splits as  RRLR, agree=0.607, adj=0.109, (0 split)
## 
## Node number 4: 1481 observations,    complexity param=0.0139
##   mean=5.04e+07, MSE=3.52e+15 
##   left son=8 (772 obs) right son=9 (709 obs)
##   Primary splits:
##       vote       < -0.497   to the left,  improve=0.2040, (0 missing)
##       budget     < -0.32    to the left,  improve=0.1950, (0 missing)
##       popularity < -0.38    to the left,  improve=0.1820, (0 missing)
##       company    splits as  LRRRRR,       improve=0.0553, (0 missing)
##       runtime    < 0.18     to the left,  improve=0.0165, (0 missing)
##   Surrogate splits:
##       popularity < -0.389   to the left,  agree=0.916, adj=0.825, (0 split)
##       budget     < -0.32    to the left,  agree=0.630, adj=0.227, (0 split)
##       genres     splits as  RLRLRLLLRLRLRLRRRL, agree=0.568, adj=0.097, (0 split)
##       company    splits as  LRLRLL, agree=0.565, adj=0.092, (0 split)
##       score      < -0.423   to the left,  agree=0.554, adj=0.068, (0 split)
## 
## Node number 5: 466 observations,    complexity param=0.0256
##   mean=1.74e+08, MSE=1.69e+16 
##   left son=10 (322 obs) right son=11 (144 obs)
##   Primary splits:
##       budget  < 0.649    to the left,  improve=0.2500, (0 missing)
##       genres  splits as  LLRLL-LRLRLLLLLLLL, improve=0.1220, (0 missing)
##       company splits as  LRRRRR, improve=0.0808, (0 missing)
##       score   < 0.973    to the right, improve=0.0708, (0 missing)
##       vote    < 0.432    to the left,  improve=0.0478, (0 missing)
##   Surrogate splits:
##       genres     splits as  LRRLL-LRLRLLLLLLLL, agree=0.755, adj=0.208, (0 split)
##       score      < -0.423   to the right, agree=0.710, adj=0.063, (0 split)
##       popularity < 1.54     to the left,  agree=0.695, adj=0.014, (0 split)
##       vote       < 0.909    to the left,  agree=0.695, adj=0.014, (0 split)
## 
## Node number 6: 101 observations,    complexity param=0.0107
##   mean=2.3e+08, MSE=3.14e+16 
##   left son=12 (85 obs) right son=13 (16 obs)
##   Primary splits:
##       genres     splits as  LRRLL-L-LLL-RLLLL-, improve=0.2590, (0 missing)
##       vote       < 2.55     to the left,  improve=0.2120, (0 missing)
##       budget     < -0.192   to the left,  improve=0.1350, (0 missing)
##       popularity < 1.81     to the left,  improve=0.0997, (0 missing)
##       season     splits as  RRLR, improve=0.0770, (0 missing)
##   Surrogate splits:
##       runtime < -0.941   to the right, agree=0.851, adj=0.063, (0 split)
##       score   < -0.365   to the right, agree=0.851, adj=0.063, (0 split)
## 
## Node number 7: 128 observations,    complexity param=0.0528
##   mean=6.43e+08, MSE=1.24e+17 
##   left son=14 (71 obs) right son=15 (57 obs)
##   Primary splits:
##       vote       < 2.44     to the left,  improve=0.2550, (0 missing)
##       budget     < 3.78     to the left,  improve=0.2390, (0 missing)
##       popularity < 3.14     to the left,  improve=0.2330, (0 missing)
##       runtime    < 1.13     to the left,  improve=0.0954, (0 missing)
##       score      < -0.772   to the left,  improve=0.0715, (0 missing)
##   Surrogate splits:
##       score      < 0.624    to the left,  agree=0.719, adj=0.368, (0 split)
##       popularity < 1.67     to the left,  agree=0.711, adj=0.351, (0 split)
##       runtime    < 1.13     to the left,  agree=0.656, adj=0.228, (0 split)
##       budget     < 2.57     to the left,  agree=0.641, adj=0.193, (0 split)
##       company    splits as  LLLLRR,       agree=0.633, adj=0.175, (0 split)
## 
## Node number 8: 772 observations
##   mean=2.47e+07, MSE=7.93e+14 
## 
## Node number 9: 709 observations
##   mean=7.84e+07, MSE=4.98e+15 
## 
## Node number 10: 322 observations
##   mean=1.3e+08, MSE=9.82e+15 
## 
## Node number 11: 144 observations
##   mean=2.71e+08, MSE=1.91e+16 
## 
## Node number 12: 85 observations
##   mean=1.91e+08, MSE=1.99e+16 
## 
## Node number 13: 16 observations
##   mean=4.38e+08, MSE=4.15e+16 
## 
## Node number 14: 71 observations
##   mean=4.83e+08, MSE=4.95e+16 
## 
## Node number 15: 57 observations,    complexity param=0.0265
##   mean=8.41e+08, MSE=1.46e+17 
##   left son=30 (47 obs) right son=31 (10 obs)
##   Primary splits:
##       budget     < 3.98     to the left,  improve=0.2450, (0 missing)
##       popularity < 2.88     to the left,  improve=0.2080, (0 missing)
##       vote       < 4.63     to the left,  improve=0.1260, (0 missing)
##       score      < 1.32     to the right, improve=0.0656, (0 missing)
##       genres     splits as  RRR-L-LRL-----R--L, improve=0.0449, (0 missing)
##   Surrogate splits:
##       vote < 7.31     to the left,  agree=0.842, adj=0.1, (0 split)
## 
## Node number 30: 47 observations
##   mean=7.54e+08, MSE=6.86e+16 
## 
## Node number 31: 10 observations
##   mean=1.25e+09, MSE=3.06e+17
#printcp(rev_tree ) # display the results 
# Pseudo R-squared of the tree: R-squared = 1 - relative error (see ?printcp).
# printcp() both prints the CP table and returns it, so capture the return
# value and index the LAST row by name instead of the hard-coded [9,3], which
# silently breaks if rpart grows a different number of splits on a re-run.
rev_tree.cptable <- printcp(rev_tree)
## 
## Regression tree:
## rpart(formula = revenue ~ ., data = train1_full, method = "anova")
## 
## Variables actually used in tree construction:
## [1] budget genres vote  
## 
## Root node error: 8e+19/2176 = 4e+16
## 
## n= 2176 
## 
##     CP nsplit rel error xerror xstd
## 1 0.39      0       1.0    1.0 0.12
## 2 0.13      1       0.6    0.7 0.09
## 3 0.07      2       0.5    0.5 0.07
## 4 0.05      3       0.4    0.5 0.06
## 5 0.03      4       0.4    0.5 0.06
## 6 0.03      5       0.3    0.5 0.06
## 7 0.01      6       0.3    0.4 0.05
## 8 0.01      7       0.3    0.4 0.05
## 9 0.01      8       0.3    0.4 0.05
rev_tree.rsq <- 1 - rev_tree.cptable[nrow(rev_tree.cptable), "rel error"]
# visualize the tree using 2 methods prp() and fancyRpartPlot()
prp(rev_tree, main = "Classification Trees for Revenue")

fancyRpartPlot(rev_tree)

2 methods give the same tree. rpart() gives access to nicer plots.

Prediction

Testing

# Evaluate the unpruned rpart tree on the held-out test set.
rev_tree_pred.test <- predict(rev_tree, newdata = test1_full)
actual_pred_tree.test <- data.frame(actuals = test1_full$revenue, predicteds = rev_tree_pred.test)
metrics_test_tree <- regr.eval(actual_pred_tree.test$actuals, actual_pred_tree.test$predicteds)
metrics_test_tree[c(1, 3)] # MAE and RMSE
##      mae     rmse 
## 6.54e+07 1.13e+08

Training

# In-sample (training) evaluation of the unpruned rpart tree.
rev_tree_pred.train <- predict(rev_tree, train1_full)
actual_pred_tree.train <- as.data.frame(cbind(actuals = train1_full$revenue, predicteds = rev_tree_pred.train))
metrics_train_tree <- regr.eval(actual_pred_tree.train$actuals, actual_pred_tree.train$predicteds)
metrics_train_tree[c(1,3)]
##      mae     rmse 
## 5.93e+07 1.01e+08

The results seem to be worse than linear models.

Pruned Tree

Pruning

We can see the error for each Cp.

printcp(rev_tree)
## 
## Regression tree:
## rpart(formula = revenue ~ ., data = train1_full, method = "anova")
## 
## Variables actually used in tree construction:
## [1] budget genres vote  
## 
## Root node error: 8e+19/2176 = 4e+16
## 
## n= 2176 
## 
##     CP nsplit rel error xerror xstd
## 1 0.39      0       1.0    1.0 0.12
## 2 0.13      1       0.6    0.7 0.09
## 3 0.07      2       0.5    0.5 0.07
## 4 0.05      3       0.4    0.5 0.06
## 5 0.03      4       0.4    0.5 0.06
## 6 0.03      5       0.3    0.5 0.06
## 7 0.01      6       0.3    0.4 0.05
## 8 0.01      7       0.3    0.4 0.05
## 9 0.01      8       0.3    0.4 0.05
plotcp(rev_tree) # visualize cross-validation results 

# prune the tree for optimization and avoid overfitting


# Cross-validated error (xerror) for each CP row; the minimum marks the best pruning point.
rev_tree$cptable[,"xerror"] # we can see the lowest xerror
##     1     2     3     4     5     6     7     8     9 
## 1.001 0.709 0.535 0.483 0.455 0.452 0.412 0.383 0.385

The cross-validated error (xerror) is lowest at the 8th CP value (CP = 0.0107).

# Choose the CP with minimum cross-validated error (xerror) and prune the tree.
# Computed with which.min() rather than the hard-coded row 8 (which merely
# happens to be the minimum in this run), so the code still picks the true
# minimum if the CP table changes on a re-run.
best_cp_row <- which.min(rev_tree$cptable[, "xerror"])
rev_tree_prune <- prune(rev_tree, cp = rev_tree$cptable[best_cp_row, "CP"]) 
fancyRpartPlot(rev_tree_prune)

Prediction

Testing

# (the pruning itself is done above; here we evaluate the pruned tree)
#rev_tree_prune <- prune(rev_tree, cp = rev_tree$cptable[8,"CP"]) 


# evaluate the pruned tree on the held-out test set
rev_tree_pred.test <- predict(rev_tree_prune, newdata = test1_full)
actual_pred_tree.test <- data.frame(actuals = test1_full$revenue, predicteds = rev_tree_pred.test)
metrics_test_tree <- regr.eval(actual_pred_tree.test$actuals, actual_pred_tree.test$predicteds)
metrics_test_tree[c(1, 3)] # MAE and RMSE
##      mae     rmse 
## 6.59e+07 1.15e+08

Training

# training
# In-sample evaluation of the pruned tree.
rev_tree_pred.train <- predict(rev_tree_prune)
actual_pred_tree.train <- as.data.frame(cbind(actuals = train1_full$revenue, predicteds = rev_tree_pred.train))
metrics_train_tree <- regr.eval(actual_pred_tree.train$actuals, actual_pred_tree.train$predicteds)
metrics_train_tree[c(1,3)]
##      mae     rmse 
## 6.04e+07 1.03e+08
# the pruned tree's metrics are very close to (though not identical with) the unpruned tree's

There are not significant changes in the results.

Random Forest

Random Forest Model

The data contains many variables. Furthermore, we are predicting a testing data using the model constructed on training set. Therefore, Random Forest (RF) would be better than Decision Tree (which prefers fewer variables and predicts within the sample/training data).

Model Construction

# Baseline random forest on the full (numerical + categorical) training set.
set.seed(123) # set.seed to make sure the selection of tree samples are the same if we re-run the code

rev_rf = randomForest(revenue~. , train1_full, ntree = 350) # construct RF model on training data, we set the number of trees as 350

rev_rf # the results of the model
## 
## Call:
##  randomForest(formula = revenue ~ ., data = train1_full, ntree = 350) 
##                Type of random forest: regression
##                      Number of trees: 350
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 9.46e+15
##                     % Var explained: 73.2
randomForest::importance(rev_rf) # the importance of each predictor
##            IncNodePurity
## budget          1.90e+19
## popularity      1.43e+19
## runtime         4.15e+18
## score           3.39e+18
## vote            2.33e+19
## genres          5.58e+18
## company         2.60e+18
## season          1.49e+18

Vote, budget and popularity have the highest importance. These are the same numerical variables selected in our best linear model, so the two models seem to be in agreement.

There is a difference when accommodating categorical variables. The linear model does not select runtime but adds company and genres, while in the Random Forest model runtime is more important than company. In this case, both models seem to agree on the genres variable only.

MSE by the number of trees

# Error of the forest as a function of the number of trees.
plot(rev_rf)

#rev_rf$mse

As the number of trees increases, the mean squared error (MSE) decreases. Beyond a certain number of trees (around 100 in our case), the MSE does not change significantly.

Testing

# predicting the model on the testing data
rev_rf.test <- predict(rev_rf, test1_full) 
actual_pred_rf.test <- as.data.frame(cbind(actuals = test1_full$revenue, predicteds = rev_rf.test))
metrics_test_rf  <- regr.eval(actual_pred_rf.test$actuals, actual_pred_rf.test$predicteds)
options(scientific=T, digits = 2) # to make the format of metrics the same as above metrics (as Xe+07)
                                  # NOTE(review): "scientific" is not a base-R option (see ?options;
                                  # the relevant knob is `scipen`); only `digits` takes effect here.

metrics_test_rf[c(1,3)]
##     mae    rmse 
## 5.5e+07 9.9e+07
options(scientific=T, digits = 3) # change back to our default format

Training

#rev_rf$predicted
rev_rf.train <- predict(rev_rf) # same results as rev_rf$predicted, both codes return the predicted values in the training set
                                # (per ?predict.randomForest, predictions without newdata are the
                                # out-of-bag predictions — TODO confirm this is what is intended here)
actual_pred_rf.train <- as.data.frame(cbind(actuals = train1_full$revenue, predicteds = rev_rf.train))
metrics_train_rf <- regr.eval(actual_pred_rf.train$actuals, actual_pred_rf.train$predicteds)
options(scientific=T, digits = 2) # to make the format of metrics the same as above metrics (as Xe+07)


metrics_train_rf[c(1,3)]
##     mae    rmse 
## 5.3e+07 9.7e+07
options(scientific=T, digits = 3) # change back to our default format

Random Forest has lower RMSEs and MAEs than Linear Model.

The pseudo R-squared of RF is slightly higher than Adj R-squared of Linear Model.

Hyperparameter Tuning

Let’s look if we can make our RF model better by using tuning method.

# there is a parameter called mtry which is used to tune a random forest model
# (mtry = number of predictors randomly tried at each split)

# we use tuneRF() function to train different models with different mtry and see which mtry gives lowest OOB (Out-Of-Bag) Error
set.seed(123)              
tune.rf <- tuneRF(x = train1_full[-c(1)], y = train1_full$revenue, ntreeTry = 350)
# train1_full[-c(1)] drops column 1 — which str(train1_full) above shows is
# revenue — leaving only the predictors in x.
## mtry = 2  OOB error = 9.46e+15 
## Searching left ...
## mtry = 1     OOB error = 1.06e+16 
## -0.118 0.05 
## Searching right ...
## mtry = 4     OOB error = 9.33e+15 
## 0.014 0.05

best_mtry <- tune.rf[,"mtry"][which.min(tune.rf[,"OOBError"])] # retrieve the mtry with lowest OOB error

#best_mtry # show the best mtry value

mtry giving lowest OOB Error is 4. Now, let’s build a RF model with mtry = 4.

Tuned RF model

Model Construction

set.seed(123) # set.seed to make sure the selection of tree samples are the same if we re-run the code
# Refit the forest with the tuned mtry = 4 found by tuneRF() above
rev_rf.tune = randomForest(revenue~. , train1_full, mtry = 4, ntree = 350)
# print the fit: OOB mean squared residuals and % variance explained
rev_rf.tune
## 
## Call:
##  randomForest(formula = revenue ~ ., data = train1_full, mtry = 4,      ntree = 350) 
##                Type of random forest: regression
##                      Number of trees: 350
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 9.17e+15
##                     % Var explained: 74

An improvement in pseudo R-squared. %Var explained increases by 1%.

Prediction

Testing

# Predict revenue for the test set with the tuned forest (mtry = 4)
rev_rf.test1 <- predict(rev_rf.tune, test1_full)
actual_pred_rf.test1 <- as.data.frame(cbind(actuals = test1_full$revenue, predicteds = rev_rf.test1))
# mae / mse / rmse / mape between actual and predicted revenue
metrics_test_rf1 <- regr.eval(actual_pred_rf.test1$actuals, actual_pred_rf.test1$predicteds)

# FIX: 'scientific' is not a base R option (see ?options); use 'scipen'
options(scipen = 0, digits = 2) # change format for easier comparison with metrics from previous model
metrics_test_rf1[c(1,3)]
##     mae    rmse 
## 5.5e+07 9.7e+07

Training

#rev_rf.tune$predicted
# Training-set (out-of-bag) predictions of the tuned forest; identical to
# rev_rf.tune$predicted
rev_rf.train1 <- predict(rev_rf.tune)
actual_pred_rf.train1 <- as.data.frame(cbind(actuals = train1_full$revenue, predicteds = rev_rf.train1))
metrics_train_rf1 <- regr.eval(actual_pred_rf.train1$actuals, actual_pred_rf.train1$predicteds)
metrics_train_rf1[c(1,3)]
##     mae    rmse 
## 5.2e+07 9.6e+07
# FIX: 'scientific' is not a base R option (see ?options); use 'scipen'
options(scipen = 0, digits = 3) # change back to our initial format

We can see the decreases in MAE and RMSE after tuning the forest.

Conclusion

Model Linear Model (budget + vote + popularity + company + genres) Regression Tree Random Forest
R-squared(adjusted/ pseudo) 0.721 0.711 0.741
MAE - train 5.8e+07 6e+07 5.2e+07
MAE - test 6.2e+07 6.6e+07 5.5e+07
RMSE - train 9.9e+07 1e+08 9.6e+07
RMSE - test 1e+08 1.1e+08 9.7e+07

Random Forest has the best performance among the three models, and it shows no sign of overfitting.

Chapter 4. Profit Prediction

Correlation

# Inspect the structure of the prepared movies data frame used for profit modelling
str(movies)
## 'data.frame':    3225 obs. of  15 variables:
##  $ X         : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ budget    : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
##  $ genres    : Factor w/ 18 levels "Action","Adventure",..: 1 2 1 1 1 9 3 1 2 1 ...
##  $ popularity: num  150.4 139.1 107.4 112.3 43.9 ...
##  $ company   : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 5 3 1 5 3 5 5 6 6 ...
##  $ date      : Date, format: "2009-12-10" "2007-05-19" ...
##  $ revenue   : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
##  $ runtime   : num  162 169 148 165 132 139 100 141 153 151 ...
##  $ title     : Factor w/ 3224 levels "(500) Days of Summer",..: 259 1761 2129 2420 1265 2139 2256 260 1053 310 ...
##  $ score     : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
##  $ vote      : int  11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
##  $ genrecount: int  4 3 3 4 3 3 2 3 3 3 ...
##  $ profit    : num  2.55e+09 6.61e+08 6.36e+08 8.35e+08 2.41e+07 ...
##  $ profitable: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month     : num  12 5 10 7 3 5 11 4 7 3 ...
# Keep only the numeric columns for the correlation matrix (drop ids, factors,
# dates, and revenue / the derived profitability columns)
movies_reg <- subset(movies, select = - c(X,title,date,genres,genrecount,company,revenue,profitable,month))
# NOTE(review): 'c' shadows base::c() in the global environment -- it works here,
# but a more distinctive name would be safer
c <- cor(movies_reg)
corrplot(c, method = "square")

We find that budget, popularity and vote have relatively strong correlations with profit, while runtime and score have only moderate correlations. The correlation between profit and month is very weak.

# One-hot encode release month (model.matrix drops the base level, month 1)
M = factor(movies$month)
dummm = as.data.frame(model.matrix(~M)[,-1])
# NOTE(review): 'movie' (singular) vs 'movies' above -- confirm this is the
# intended data frame and not a typo
movie_M = cbind(movie, dummm)
# FIX: the original selection listed M6 twice and omitted M5, so May was
# silently missing from the month/profit correlation plot
movies_c <- subset(movie_M, select = c(profit,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12))

cc <- cor(movies_c)
corrplot(cc, method = "square")

#str(movie_M)
  • Month is not strongly correlated with profit, but we do observe that month six (June) has a relatively positive effect on profit, whereas month nine (September) has a relatively negative effect.
library(data.table)
# One-hot encode the production company. model.matrix() drops the base level
# ("Others"), leaving one indicator column per remaining studio.
FactoredVariable = factor(movie_M$company)
company_ind = as.data.frame(model.matrix(~FactoredVariable)[, -1])
movie_OH = cbind(movie_M, company_ind)
# Map the auto-generated dummy column names to short studio labels
studio_labels <- c(
  `FactoredVariableParamount Pictures` = 'Paramount',
  `FactoredVariableSony Pictures`      = 'Sony',
  `FactoredVariableUniversal Pictures` = 'Universal',
  `FactoredVariableWalt Disney`        = 'Disney',
  `FactoredVariableWarner Bros`        = 'Warner Bro'
)
for (old_name in names(studio_labels)) {
  names(movie_OH)[names(movie_OH) == old_name] <- studio_labels[[old_name]]
}
# Correlation of profit with each studio indicator
movies_cor <- subset(movie_OH, select = c(profit, Paramount, Sony, Universal, Disney, `Warner Bro`))
c <- cor(movies_cor)
print(c)
##              profit Paramount     Sony Universal Disney Warner Bro
## profit      1.00000    0.0437  0.00881    0.0952  0.118   -0.00723
## Paramount   0.04371    1.0000 -0.08982   -0.1003 -0.125   -0.07967
## Sony        0.00881   -0.0898  1.00000   -0.1049 -0.131   -0.08334
## Universal   0.09516   -0.1003 -0.10488    1.0000 -0.146   -0.09303
## Disney      0.11825   -0.1251 -0.13084   -0.1460  1.000   -0.11605
## Warner Bro -0.00723   -0.0797 -0.08334   -0.0930 -0.116    1.00000
  • For each company we create a new column under the name of that company and mark 0 or 1 depending on whether the movie is produced by that company. From the correlations between the different companies and profit, we see that each company indicator is associated with profit, though only weakly.
# One-hot encode genre; model.matrix drops the base level, producing columns
# named G<genre> (e.g. GAnimation, GFamily) used by later models
G = factor(movie_OH$genres) 
dumm = as.data.frame(model.matrix(~G)[,-1])
movie_OH = cbind(movie_OH, dumm)
#str(movie_OH)
  • For each genre we create a new column under the name of that genre and mark 0 or 1 depending on if the movie is of that genre. By the correlations between different genres with profit, we get that they are all dependent with profit.

Linear Regression

Initial Model

## 
## Call:
## lm(formula = profit ~ ., data = movie_r)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.837 -0.253 -0.014  0.189 10.196 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -0.07441    0.11317   -0.66  0.51091    
## budget                     0.16418    0.01578   10.40  < 2e-16 ***
## popularity                 0.05699    0.01697    3.36  0.00080 ***
## runtime                    0.03582    0.01383    2.59  0.00963 ** 
## vote                       0.58674    0.01960   29.93  < 2e-16 ***
## genresAdventure            0.18645    0.04616    4.04  5.5e-05 ***
## genresAnimation            0.50587    0.07206    7.02  2.7e-12 ***
## genresComedy               0.17053    0.03760    4.53  6.0e-06 ***
## genresCrime               -0.01589    0.06085   -0.26  0.79394    
## genresDocumentary          0.32226    0.12060    2.67  0.00758 ** 
## genresDrama                0.04765    0.03790    1.26  0.20877    
## genresFamily               0.51303    0.10739    4.78  1.9e-06 ***
## genresFantasy              0.05216    0.07115    0.73  0.46350    
## genresHistory              0.08686    0.15304    0.57  0.57038    
## genresHorror               0.16382    0.05392    3.04  0.00240 ** 
## genresMusic                0.16828    0.14481    1.16  0.24529    
## genresMystery              0.06660    0.12490    0.53  0.59390    
## genresRomance              0.17402    0.08071    2.16  0.03115 *  
## genresScience Fiction      0.00999    0.07638    0.13  0.89598    
## genresThriller            -0.02116    0.06430   -0.33  0.74216    
## genresWar                 -0.21265    0.15214   -1.40  0.16231    
## genresWestern             -0.05912    0.13885   -0.43  0.67031    
## month2                    -0.01471    0.06234   -0.24  0.81349    
## month3                    -0.01656    0.06177   -0.27  0.78859    
## month4                     0.06175    0.06284    0.98  0.32582    
## month5                     0.18487    0.06204    2.98  0.00291 ** 
## month6                     0.20766    0.05989    3.47  0.00053 ***
## month7                     0.01491    0.06081    0.25  0.80628    
## month8                     0.02120    0.05933    0.36  0.72085    
## month9                     0.00411    0.05627    0.07  0.94174    
## month10                   -0.05029    0.05925   -0.85  0.39612    
## month11                    0.13956    0.06157    2.27  0.02348 *  
## month12                    0.14675    0.05874    2.50  0.01252 *  
## score                     -0.02059    0.01684   -1.22  0.22132    
## companyParamount Pictures  0.10903    0.04318    2.53  0.01162 *  
## companySony Pictures       0.03610    0.04167    0.87  0.38643    
## companyUniversal Pictures  0.15210    0.03843    3.96  7.7e-05 ***
## companyWalt Disney         0.10309    0.03342    3.08  0.00205 ** 
## companyWarner Bros         0.00720    0.04564    0.16  0.87470    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.632 on 3186 degrees of freedom
## Multiple R-squared:  0.605,  Adjusted R-squared:  0.601 
## F-statistic:  129 on 38 and 3186 DF,  p-value: <2e-16
  • The first model call is lm(formula = profit ~ ., data = movie_r).
  • By the result of our initial linear regression model on profit, we get that it is unlikely we will observe a relationship between some of the predictors (like genre Crime, genre Drama, score, and company Warner Bros, whose p-values are large) and the response (profit).

Model Selection

loadPkg("leaps")
loadPkg("ISLR")
#This is essentially best fit 
reg.best <- regsubsets(profit~. , data = movie_r, nvmax = 14, nbest = 1, method = "backward")
plot(reg.best, scale = "adjr2", main = "Adjusted R^2")

plot(reg.best, scale = "r2", main = "R^2")

# In the "leaps" package, we can use scale=c("bic","Cp","adjr2","r2")
plot(reg.best, scale = "bic", main = "BIC")

plot(reg.best, scale = "Cp", main = "Cp")

summary(reg.best)
## Subset selection object
## Call: regsubsets.formula(profit ~ ., data = movie_r, nvmax = 14, nbest = 1, 
##     method = "backward")
## 38 Variables  (and intercept)
##                           Forced in Forced out
## budget                        FALSE      FALSE
## popularity                    FALSE      FALSE
## runtime                       FALSE      FALSE
## vote                          FALSE      FALSE
## genresAdventure               FALSE      FALSE
## genresAnimation               FALSE      FALSE
## genresComedy                  FALSE      FALSE
## genresCrime                   FALSE      FALSE
## genresDocumentary             FALSE      FALSE
## genresDrama                   FALSE      FALSE
## genresFamily                  FALSE      FALSE
## genresFantasy                 FALSE      FALSE
## genresHistory                 FALSE      FALSE
## genresHorror                  FALSE      FALSE
## genresMusic                   FALSE      FALSE
## genresMystery                 FALSE      FALSE
## genresRomance                 FALSE      FALSE
## genresScience Fiction         FALSE      FALSE
## genresThriller                FALSE      FALSE
## genresWar                     FALSE      FALSE
## genresWestern                 FALSE      FALSE
## month2                        FALSE      FALSE
## month3                        FALSE      FALSE
## month4                        FALSE      FALSE
## month5                        FALSE      FALSE
## month6                        FALSE      FALSE
## month7                        FALSE      FALSE
## month8                        FALSE      FALSE
## month9                        FALSE      FALSE
## month10                       FALSE      FALSE
## month11                       FALSE      FALSE
## month12                       FALSE      FALSE
## score                         FALSE      FALSE
## companyParamount Pictures     FALSE      FALSE
## companySony Pictures          FALSE      FALSE
## companyUniversal Pictures     FALSE      FALSE
## companyWalt Disney            FALSE      FALSE
## companyWarner Bros            FALSE      FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: backward
##           budget popularity runtime vote genresAdventure genresAnimation
## 1  ( 1 )  " "    " "        " "     "*"  " "             " "            
## 2  ( 1 )  "*"    " "        " "     "*"  " "             " "            
## 3  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 4  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 5  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 6  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 7  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 8  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 9  ( 1 )  "*"    " "        " "     "*"  "*"             "*"            
## 10  ( 1 ) "*"    " "        " "     "*"  "*"             "*"            
## 11  ( 1 ) "*"    " "        " "     "*"  "*"             "*"            
## 12  ( 1 ) "*"    "*"        " "     "*"  "*"             "*"            
## 13  ( 1 ) "*"    "*"        " "     "*"  "*"             "*"            
## 14  ( 1 ) "*"    "*"        " "     "*"  "*"             "*"            
##           genresComedy genresCrime genresDocumentary genresDrama
## 1  ( 1 )  " "          " "         " "               " "        
## 2  ( 1 )  " "          " "         " "               " "        
## 3  ( 1 )  " "          " "         " "               " "        
## 4  ( 1 )  " "          " "         " "               " "        
## 5  ( 1 )  " "          " "         " "               " "        
## 6  ( 1 )  " "          " "         " "               " "        
## 7  ( 1 )  " "          " "         " "               " "        
## 8  ( 1 )  "*"          " "         " "               " "        
## 9  ( 1 )  "*"          " "         " "               " "        
## 10  ( 1 ) "*"          " "         " "               " "        
## 11  ( 1 ) "*"          " "         " "               " "        
## 12  ( 1 ) "*"          " "         " "               " "        
## 13  ( 1 ) "*"          " "         " "               " "        
## 14  ( 1 ) "*"          " "         " "               " "        
##           genresFamily genresFantasy genresHistory genresHorror
## 1  ( 1 )  " "          " "           " "           " "         
## 2  ( 1 )  " "          " "           " "           " "         
## 3  ( 1 )  " "          " "           " "           " "         
## 4  ( 1 )  " "          " "           " "           " "         
## 5  ( 1 )  "*"          " "           " "           " "         
## 6  ( 1 )  "*"          " "           " "           " "         
## 7  ( 1 )  "*"          " "           " "           " "         
## 8  ( 1 )  "*"          " "           " "           " "         
## 9  ( 1 )  "*"          " "           " "           " "         
## 10  ( 1 ) "*"          " "           " "           " "         
## 11  ( 1 ) "*"          " "           " "           " "         
## 12  ( 1 ) "*"          " "           " "           " "         
## 13  ( 1 ) "*"          " "           " "           " "         
## 14  ( 1 ) "*"          " "           " "           "*"         
##           genresMusic genresMystery genresRomance genresScience Fiction
## 1  ( 1 )  " "         " "           " "           " "                  
## 2  ( 1 )  " "         " "           " "           " "                  
## 3  ( 1 )  " "         " "           " "           " "                  
## 4  ( 1 )  " "         " "           " "           " "                  
## 5  ( 1 )  " "         " "           " "           " "                  
## 6  ( 1 )  " "         " "           " "           " "                  
## 7  ( 1 )  " "         " "           " "           " "                  
## 8  ( 1 )  " "         " "           " "           " "                  
## 9  ( 1 )  " "         " "           " "           " "                  
## 10  ( 1 ) " "         " "           " "           " "                  
## 11  ( 1 ) " "         " "           " "           " "                  
## 12  ( 1 ) " "         " "           " "           " "                  
## 13  ( 1 ) " "         " "           " "           " "                  
## 14  ( 1 ) " "         " "           " "           " "                  
##           genresThriller genresWar genresWestern month2 month3 month4
## 1  ( 1 )  " "            " "       " "           " "    " "    " "   
## 2  ( 1 )  " "            " "       " "           " "    " "    " "   
## 3  ( 1 )  " "            " "       " "           " "    " "    " "   
## 4  ( 1 )  " "            " "       " "           " "    " "    " "   
## 5  ( 1 )  " "            " "       " "           " "    " "    " "   
## 6  ( 1 )  " "            " "       " "           " "    " "    " "   
## 7  ( 1 )  " "            " "       " "           " "    " "    " "   
## 8  ( 1 )  " "            " "       " "           " "    " "    " "   
## 9  ( 1 )  " "            " "       " "           " "    " "    " "   
## 10  ( 1 ) " "            " "       " "           " "    " "    " "   
## 11  ( 1 ) " "            " "       " "           " "    " "    " "   
## 12  ( 1 ) " "            " "       " "           " "    " "    " "   
## 13  ( 1 ) " "            " "       " "           " "    " "    " "   
## 14  ( 1 ) " "            " "       " "           " "    " "    " "   
##           month5 month6 month7 month8 month9 month10 month11 month12 score
## 1  ( 1 )  " "    " "    " "    " "    " "    " "     " "     " "     " "  
## 2  ( 1 )  " "    " "    " "    " "    " "    " "     " "     " "     " "  
## 3  ( 1 )  " "    " "    " "    " "    " "    " "     " "     " "     " "  
## 4  ( 1 )  " "    "*"    " "    " "    " "    " "     " "     " "     " "  
## 5  ( 1 )  " "    "*"    " "    " "    " "    " "     " "     " "     " "  
## 6  ( 1 )  " "    "*"    " "    " "    " "    " "     " "     "*"     " "  
## 7  ( 1 )  "*"    "*"    " "    " "    " "    " "     " "     "*"     " "  
## 8  ( 1 )  "*"    "*"    " "    " "    " "    " "     " "     "*"     " "  
## 9  ( 1 )  "*"    "*"    " "    " "    " "    " "     " "     "*"     " "  
## 10  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
## 11  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
## 12  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
## 13  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
## 14  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
##           companyParamount Pictures companySony Pictures
## 1  ( 1 )  " "                       " "                 
## 2  ( 1 )  " "                       " "                 
## 3  ( 1 )  " "                       " "                 
## 4  ( 1 )  " "                       " "                 
## 5  ( 1 )  " "                       " "                 
## 6  ( 1 )  " "                       " "                 
## 7  ( 1 )  " "                       " "                 
## 8  ( 1 )  " "                       " "                 
## 9  ( 1 )  " "                       " "                 
## 10  ( 1 ) " "                       " "                 
## 11  ( 1 ) " "                       " "                 
## 12  ( 1 ) " "                       " "                 
## 13  ( 1 ) " "                       " "                 
## 14  ( 1 ) " "                       " "                 
##           companyUniversal Pictures companyWalt Disney companyWarner Bros
## 1  ( 1 )  " "                       " "                " "               
## 2  ( 1 )  " "                       " "                " "               
## 3  ( 1 )  " "                       " "                " "               
## 4  ( 1 )  " "                       " "                " "               
## 5  ( 1 )  " "                       " "                " "               
## 6  ( 1 )  " "                       " "                " "               
## 7  ( 1 )  " "                       " "                " "               
## 8  ( 1 )  " "                       " "                " "               
## 9  ( 1 )  " "                       " "                " "               
## 10  ( 1 ) " "                       " "                " "               
## 11  ( 1 ) "*"                       " "                " "               
## 12  ( 1 ) "*"                       " "                " "               
## 13  ( 1 ) "*"                       "*"                " "               
## 14  ( 1 ) "*"                       "*"                " "
# NOTE(review): the summary printed below is identical to the previous run, which
# suggests month was already a factor in movie_r (month2..month12 dummies appear
# in both runs) -- confirm whether this conversion and re-run are needed
movie_r$month <- as.factor(movie_r$month)
reg.best <- regsubsets(profit~. , data = movie_r, nvmax = 14, nbest = 1, method = "backward")
plot(reg.best, scale = "adjr2", main = "Adjusted R^2")

plot(reg.best, scale = "r2", main = "R^2")

# In the "leaps" package, we can use scale=c("bic","Cp","adjr2","r2")
plot(reg.best, scale = "bic", main = "BIC")

plot(reg.best, scale = "Cp", main = "Cp")

summary(reg.best)
## Subset selection object
## Call: regsubsets.formula(profit ~ ., data = movie_r, nvmax = 14, nbest = 1, 
##     method = "backward")
## 38 Variables  (and intercept)
##                           Forced in Forced out
## budget                        FALSE      FALSE
## popularity                    FALSE      FALSE
## runtime                       FALSE      FALSE
## vote                          FALSE      FALSE
## genresAdventure               FALSE      FALSE
## genresAnimation               FALSE      FALSE
## genresComedy                  FALSE      FALSE
## genresCrime                   FALSE      FALSE
## genresDocumentary             FALSE      FALSE
## genresDrama                   FALSE      FALSE
## genresFamily                  FALSE      FALSE
## genresFantasy                 FALSE      FALSE
## genresHistory                 FALSE      FALSE
## genresHorror                  FALSE      FALSE
## genresMusic                   FALSE      FALSE
## genresMystery                 FALSE      FALSE
## genresRomance                 FALSE      FALSE
## genresScience Fiction         FALSE      FALSE
## genresThriller                FALSE      FALSE
## genresWar                     FALSE      FALSE
## genresWestern                 FALSE      FALSE
## month2                        FALSE      FALSE
## month3                        FALSE      FALSE
## month4                        FALSE      FALSE
## month5                        FALSE      FALSE
## month6                        FALSE      FALSE
## month7                        FALSE      FALSE
## month8                        FALSE      FALSE
## month9                        FALSE      FALSE
## month10                       FALSE      FALSE
## month11                       FALSE      FALSE
## month12                       FALSE      FALSE
## score                         FALSE      FALSE
## companyParamount Pictures     FALSE      FALSE
## companySony Pictures          FALSE      FALSE
## companyUniversal Pictures     FALSE      FALSE
## companyWalt Disney            FALSE      FALSE
## companyWarner Bros            FALSE      FALSE
## 1 subsets of each size up to 14
## Selection Algorithm: backward
##           budget popularity runtime vote genresAdventure genresAnimation
## 1  ( 1 )  " "    " "        " "     "*"  " "             " "            
## 2  ( 1 )  "*"    " "        " "     "*"  " "             " "            
## 3  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 4  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 5  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 6  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 7  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 8  ( 1 )  "*"    " "        " "     "*"  " "             "*"            
## 9  ( 1 )  "*"    " "        " "     "*"  "*"             "*"            
## 10  ( 1 ) "*"    " "        " "     "*"  "*"             "*"            
## 11  ( 1 ) "*"    " "        " "     "*"  "*"             "*"            
## 12  ( 1 ) "*"    "*"        " "     "*"  "*"             "*"            
## 13  ( 1 ) "*"    "*"        " "     "*"  "*"             "*"            
## 14  ( 1 ) "*"    "*"        " "     "*"  "*"             "*"            
##           genresComedy genresCrime genresDocumentary genresDrama
## 1  ( 1 )  " "          " "         " "               " "        
## 2  ( 1 )  " "          " "         " "               " "        
## 3  ( 1 )  " "          " "         " "               " "        
## 4  ( 1 )  " "          " "         " "               " "        
## 5  ( 1 )  " "          " "         " "               " "        
## 6  ( 1 )  " "          " "         " "               " "        
## 7  ( 1 )  " "          " "         " "               " "        
## 8  ( 1 )  "*"          " "         " "               " "        
## 9  ( 1 )  "*"          " "         " "               " "        
## 10  ( 1 ) "*"          " "         " "               " "        
## 11  ( 1 ) "*"          " "         " "               " "        
## 12  ( 1 ) "*"          " "         " "               " "        
## 13  ( 1 ) "*"          " "         " "               " "        
## 14  ( 1 ) "*"          " "         " "               " "        
##           genresFamily genresFantasy genresHistory genresHorror
## 1  ( 1 )  " "          " "           " "           " "         
## 2  ( 1 )  " "          " "           " "           " "         
## 3  ( 1 )  " "          " "           " "           " "         
## 4  ( 1 )  " "          " "           " "           " "         
## 5  ( 1 )  "*"          " "           " "           " "         
## 6  ( 1 )  "*"          " "           " "           " "         
## 7  ( 1 )  "*"          " "           " "           " "         
## 8  ( 1 )  "*"          " "           " "           " "         
## 9  ( 1 )  "*"          " "           " "           " "         
## 10  ( 1 ) "*"          " "           " "           " "         
## 11  ( 1 ) "*"          " "           " "           " "         
## 12  ( 1 ) "*"          " "           " "           " "         
## 13  ( 1 ) "*"          " "           " "           " "         
## 14  ( 1 ) "*"          " "           " "           "*"         
##           genresMusic genresMystery genresRomance genresScience Fiction
## 1  ( 1 )  " "         " "           " "           " "                  
## 2  ( 1 )  " "         " "           " "           " "                  
## 3  ( 1 )  " "         " "           " "           " "                  
## 4  ( 1 )  " "         " "           " "           " "                  
## 5  ( 1 )  " "         " "           " "           " "                  
## 6  ( 1 )  " "         " "           " "           " "                  
## 7  ( 1 )  " "         " "           " "           " "                  
## 8  ( 1 )  " "         " "           " "           " "                  
## 9  ( 1 )  " "         " "           " "           " "                  
## 10  ( 1 ) " "         " "           " "           " "                  
## 11  ( 1 ) " "         " "           " "           " "                  
## 12  ( 1 ) " "         " "           " "           " "                  
## 13  ( 1 ) " "         " "           " "           " "                  
## 14  ( 1 ) " "         " "           " "           " "                  
##           genresThriller genresWar genresWestern month2 month3 month4
## 1  ( 1 )  " "            " "       " "           " "    " "    " "   
## 2  ( 1 )  " "            " "       " "           " "    " "    " "   
## 3  ( 1 )  " "            " "       " "           " "    " "    " "   
## 4  ( 1 )  " "            " "       " "           " "    " "    " "   
## 5  ( 1 )  " "            " "       " "           " "    " "    " "   
## 6  ( 1 )  " "            " "       " "           " "    " "    " "   
## 7  ( 1 )  " "            " "       " "           " "    " "    " "   
## 8  ( 1 )  " "            " "       " "           " "    " "    " "   
## 9  ( 1 )  " "            " "       " "           " "    " "    " "   
## 10  ( 1 ) " "            " "       " "           " "    " "    " "   
## 11  ( 1 ) " "            " "       " "           " "    " "    " "   
## 12  ( 1 ) " "            " "       " "           " "    " "    " "   
## 13  ( 1 ) " "            " "       " "           " "    " "    " "   
## 14  ( 1 ) " "            " "       " "           " "    " "    " "   
##           month5 month6 month7 month8 month9 month10 month11 month12 score
## 1  ( 1 )  " "    " "    " "    " "    " "    " "     " "     " "     " "  
## 2  ( 1 )  " "    " "    " "    " "    " "    " "     " "     " "     " "  
## 3  ( 1 )  " "    " "    " "    " "    " "    " "     " "     " "     " "  
## 4  ( 1 )  " "    "*"    " "    " "    " "    " "     " "     " "     " "  
## 5  ( 1 )  " "    "*"    " "    " "    " "    " "     " "     " "     " "  
## 6  ( 1 )  " "    "*"    " "    " "    " "    " "     " "     "*"     " "  
## 7  ( 1 )  "*"    "*"    " "    " "    " "    " "     " "     "*"     " "  
## 8  ( 1 )  "*"    "*"    " "    " "    " "    " "     " "     "*"     " "  
## 9  ( 1 )  "*"    "*"    " "    " "    " "    " "     " "     "*"     " "  
## 10  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
## 11  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
## 12  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
## 13  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
## 14  ( 1 ) "*"    "*"    " "    " "    " "    " "     "*"     "*"     " "  
##           companyParamount Pictures companySony Pictures
## 1  ( 1 )  " "                       " "                 
## 2  ( 1 )  " "                       " "                 
## 3  ( 1 )  " "                       " "                 
## 4  ( 1 )  " "                       " "                 
## 5  ( 1 )  " "                       " "                 
## 6  ( 1 )  " "                       " "                 
## 7  ( 1 )  " "                       " "                 
## 8  ( 1 )  " "                       " "                 
## 9  ( 1 )  " "                       " "                 
## 10  ( 1 ) " "                       " "                 
## 11  ( 1 ) " "                       " "                 
## 12  ( 1 ) " "                       " "                 
## 13  ( 1 ) " "                       " "                 
## 14  ( 1 ) " "                       " "                 
##           companyUniversal Pictures companyWalt Disney companyWarner Bros
## 1  ( 1 )  " "                       " "                " "               
## 2  ( 1 )  " "                       " "                " "               
## 3  ( 1 )  " "                       " "                " "               
## 4  ( 1 )  " "                       " "                " "               
## 5  ( 1 )  " "                       " "                " "               
## 6  ( 1 )  " "                       " "                " "               
## 7  ( 1 )  " "                       " "                " "               
## 8  ( 1 )  " "                       " "                " "               
## 9  ( 1 )  " "                       " "                " "               
## 10  ( 1 ) " "                       " "                " "               
## 11  ( 1 ) "*"                       " "                " "               
## 12  ( 1 ) "*"                       " "                " "               
## 13  ( 1 ) "*"                       "*"                " "               
## 14  ( 1 ) "*"                       "*"                " "

##                           Abbreviation
## budget                               b
## popularity                           p
## runtime                              r
## vote                                 v
## genresAdventure                 gnrsAd
## genresAnimation                 gnrsAn
## genresComedy                    gnrsCm
## genresCrime                     gnrsCr
## genresDocumentary               gnrsDc
## genresDrama                     gnrsDr
## genresFamily                    gnrsFm
## genresFantasy                   gnrsFn
## genresHistory                   gnrsHs
## genresHorror                    gnrsHr
## genresMusic                     gnrsMs
## genresMystery                   gnrsMy
## genresRomance                       gR
## genresScience Fiction              gSF
## genresThriller                      gT
## genresWar                       gnrsWr
## genresWestern                   gnrsWs
## month2                              m2
## month3                              m3
## month4                              m4
## month5                              m5
## month6                              m6
## month7                              m7
## month8                              m8
## month9                              m9
## month10                            m10
## month11                            m11
## month12                            m12
## score                                s
## companyParamount Pictures          cPP
## companySony Pictures               cSP
## companyUniversal Pictures          cUP
## companyWalt Disney                  cD
## companyWarner Bros                  cB

According to the plots above, we know that predictors (budget, vote, month5, month6,month12, genre Animation, and genre family) form the best model.

Other linear models

## 
## Call:
## lm(formula = profit ~ budget + vote + M5 + M6 + M12 + GAnimation + 
##     GFamily, data = movie_OH)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.016 -0.256 -0.016  0.161  9.939 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.0655     0.0135   -4.85  1.3e-06 ***
## budget        0.1853     0.0137   13.55  < 2e-16 ***
## vote          0.6236     0.0134   46.53  < 2e-16 ***
## M5            0.1705     0.0433    3.93  8.5e-05 ***
## M6            0.2104     0.0402    5.23  1.8e-07 ***
## M12           0.1500     0.0374    4.01  6.3e-05 ***
## GAnimation    0.4053     0.0665    6.10  1.2e-09 ***
## GFamily       0.4635     0.1047    4.43  9.9e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.64 on 3217 degrees of freedom
## Multiple R-squared:  0.592,  Adjusted R-squared:  0.591 
## F-statistic:  665 on 7 and 3217 DF,  p-value: <2e-16
  • With r-square equal to 0.591, 59.10% of profit can be explained by the 7 features above, such as the movie budget and whether the movie is released in May, June or December. The accuracy of the new model is close to the original one. We can reduce the number of variables to 7. With the 7 features above, we are likely to achieve a model with a similar level of fitness compared to the original linear regression model.

Ridge regression

## [1]  38 201

  • The glmnet( ) function creates 201 models, with our choice of 201 \(\lambda\) values. Each model coefficients are stored in the object we named: ridge.mod
  • There are 38 coefficients for each model. The 201 \(\lambda\) values are chosen from \(10^{-4}\) to \(10^{10}\), essentially covering the ordinary least square model (\(\lambda\) = 0), and the null/constant model (\(\lambda\) approach infinity).

  • We try to find a value for lambda that is optimal. And it turns that the best lambda value is 0.032.

  • With lambda equal to 0.032, we have the r square to be 0.604, which is slightly better than the linear model we have. Since the best lambda value is close to 0, the original linear regression model is not overfit.

## [1] 1.26
##               (Intercept) companyParamount Pictures 
##                  -0.03255                   0.05064 
##      companySony Pictures companyUniversal Pictures 
##                   0.01068                   0.08523 
##        companyWalt Disney        companyWarner Bros 
##                   0.08018                   0.00256 
##                    month2                    month3 
##                  -0.05031                  -0.02952 
##                    month4                    month5 
##                   0.00378                   0.09768 
##                    month6                    month7 
##                   0.11181                   0.01346 
##                    month8                    month9 
##                  -0.04168                  -0.06057 
##                   month10                   month11 
##                  -0.05497                   0.05325 
##                   month12           genresAdventure 
##                   0.05378                   0.11871 
##           genresAnimation              genresComedy 
##                   0.23731                  -0.00308 
##               genresCrime         genresDocumentary 
##                  -0.07895                  -0.02050 
##               genresDrama              genresFamily 
##                  -0.06092                   0.17305 
##             genresFantasy             genresHistory 
##                   0.01646                  -0.01602 
##              genresHorror               genresMusic 
##                  -0.00571                  -0.04809 
##             genresMystery             genresRomance 
##                  -0.02937                   0.00378 
##     genresScience Fiction            genresThriller 
##                   0.04979                  -0.04919 
##                 genresWar             genresWestern 
##                  -0.14120                  -0.11157 
##                popularity                      vote 
##                   0.14075                   0.22603 
##                     score                    budget 
##                   0.04979                   0.14449
## [1] 0.55
## [1] 1.26
##               (Intercept) companyParamount Pictures 
##                  -0.03255                   0.05064 
##      companySony Pictures companyUniversal Pictures 
##                   0.01068                   0.08523 
##        companyWalt Disney        companyWarner Bros 
##                   0.08018                   0.00256 
##                    month2                    month3 
##                  -0.05031                  -0.02952 
##                    month4                    month5 
##                   0.00378                   0.09768 
##                    month6                    month7 
##                   0.11181                   0.01346 
##                    month8                    month9 
##                  -0.04168                  -0.06057 
##                   month10                   month11 
##                  -0.05497                   0.05325 
##                   month12           genresAdventure 
##                   0.05378                   0.11871 
##           genresAnimation              genresComedy 
##                   0.23731                  -0.00308 
##               genresCrime         genresDocumentary 
##                  -0.07895                  -0.02050 
##               genresDrama              genresFamily 
##                  -0.06092                   0.17305 
##             genresFantasy             genresHistory 
##                   0.01646                  -0.01602 
##              genresHorror               genresMusic 
##                  -0.00571                  -0.04809 
##             genresMystery             genresRomance 
##                  -0.02937                   0.00378 
##     genresScience Fiction            genresThriller 
##                   0.04979                  -0.04919 
##                 genresWar             genresWestern 
##                  -0.14120                  -0.11157 
##                popularity                      vote 
##                   0.14075                   0.22603 
##                     score                    budget 
##                   0.04979                   0.14449
## [1] 1.14

Lasso Regression

Split train and test sets

Build the model

## [1] 0.603
##    companyWarner Bros                month7                month8 
##                     0                     0                     0 
##         genresFantasy         genresMystery genresScience Fiction 
##                     0                     0                     0 
##                 score 
##                     0

Here, we see that the lowest MSE occurs near the optimal \(\lambda\) value found by cross-validation. The fitted model has 31 non-zero coefficients (7 of the 38 coefficients, listed above, are shrunk to zero).

Random Forest Regression

#str(movies_rf)
# install.packages('caTools')
library(caTools)
# Keep only the eight predictor columns used by the random forest.
movie_RF <- subset(movies,select = c(company,genres,month,runtime,budget,popularity,score,vote))
set.seed(123)
# NOTE(review): sample.split() is designed to take the outcome *vector*, not a
# whole data.frame, and the resulting `split` / training_set / test_set are
# never used below -- TODO confirm whether a train/test split was intended.
split = sample.split(movie_RF, SplitRatio = 0.8)
training_set = subset(movie_RF, split == TRUE)
test_set = subset(movie_RF, split == FALSE)

# Fit a 500-tree regression forest for profit.
# NOTE(review): the forest is trained on the FULL movie_RF data rather than
# training_set, and the target comes from `movie$profit` while the predictors
# come from `movies` -- verify both refer to the same rows in the same order.
RFR = randomForest(x = movie_RF,
                         y = movie$profit,
                         ntree = 500)
print(RFR)
## 
## Call:
##  randomForest(x = movie_RF, y = movie$profit, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 8.92e+15
##                     % Var explained: 64.4

The random forest regression model has the highest r-squared value: 0.644 (64.4% of variance explained) with 500 trees.

plot(RFR)

It turns out that 500 trees are enough to build the model.

#summary(regressor)
# Predicting a new result with Random Forest Regression

# Predict profits for hypothetical movies with the fitted random forest.
Mulan <- data.frame(company = 'Walt Disney',genres = 'Animation',month = 3, runtime = 100 ,budget = 25000000, popularity = 165.13, score = 9.4,vote = 4566)
#levels(Mulan$company) <- regressor$forest$xlevels$company
#levels(Mulan$genre) <- regressor$forest$xlevels$genre
#Mulan_profit = predict(regressor, Mulan)


# Set the popularity the same as the movie Frozen's
Frozen2a <- data.frame(company = 'Walt Disney',genres = 'Animation',month = 11, runtime = 104 ,budget = 33000000, popularity = 165.13, score = 6.79,vote = 5295)
# BUG FIX: `levels(x) <- new` RELABELS the existing factor codes, so the
# single-level factor 'Walt Disney' silently became whatever the first
# training-set level is. Rebuild the factor against the model's xlevels
# instead, which preserves the value and only extends the level set.
Frozen2a$company <- factor(as.character(Frozen2a$company), levels = RFR$forest$xlevels$company)
Frozen2a$genres  <- factor(as.character(Frozen2a$genres),  levels = RFR$forest$xlevels$genres)
Frozen2_profita = predict(RFR, Frozen2a)

Frozen <- data.frame(company = 'Walt Disney',genres = 'Animation',month = 11, runtime = 104 ,budget = 150000000, popularity = 165.13, score = 7.3,vote = 5295)
Frozen$company <- factor(as.character(Frozen$company), levels = RFR$forest$xlevels$company)
Frozen$genres  <- factor(as.character(Frozen$genres),  levels = RFR$forest$xlevels$genres)
Frozen_profit = predict(RFR, Frozen)

Chapter 5. PCA

We want to examine if Principal Component Analysis method is good at dimensional reduction. In our data, we have 5 continuous variables to predict the revenue. We will perform principal component regression directly and see how many components are sufficient for the model.

In this part, we will also clarify the reason to scale our data in previous chapter by comparing two PCR models of different versions: centered data and non-centered data.

PCAxform <- function(df, z=TRUE) { 
  #' Obtain the dataframe with the Principal Components after the rotation. 
  #' ELo 201903 GWU DATS
  #' @param df The dataframe (all columns must be numeric).
  #' @param z T/F or 0/1 for z-score to be used
  #' @return The transformed dataframe, columns named PC1, PC2, ...
  #' @examples
  #' tmp = PCAxform(USArrests,TRUE)

  # Normalize the many accepted truthy spellings of z to a plain logical.
  z <- isTRUE(z) || z %in% c("true", "True", "T", "t", 1, "1")
  if (z) { df <- data.frame(scale(df)) }  # scale() needs all-numeric columns, as does prcomp()
  pr.out <- prcomp(df, scale = z)
  # Project the data onto the rotation in one matrix multiplication instead of
  # growing a data.frame column-by-column in nested loops (same result, O(n)).
  # NOTE: the data is deliberately NOT re-centered before projection, so for
  # z = FALSE these scores differ from pr.out$x by a constant offset -- this
  # reproduces the original function's behavior exactly.
  scores <- as.matrix(df) %*% pr.out$rotation
  df1 <- as.data.frame(scores)
  colnames(df1) <- paste0("PC", seq_len(ncol(df)))
  rownames(df1) <- NULL
  return(df1)
}


# To-be-implemented: for z=TRUE, it will be better to have the z-scaling option for x-vars and y separately. It is actually convenient keep y in original units.
PCRxform <- function(df, y, zX=TRUE, zy=FALSE) { 
  #' Obtain the dataframe with the Principal Components after the rotation for PCRegression. Requires related function PCAxform()
  #' ELo 201903 GWU DATS
  #' @param df The dataframe.
  #' @param y The y-variable column index number (any whole number, e.g. 3 or 3L), or the name of the y-variable
  #' @param zX T/F or 0/1 for z-score used on X-variables
  #' @param zy T/F or 0/1 for z-score used on the target y-variable
  #' @return The transformed dataframe (y first, then PC1, PC2, ...), or NULL if y is invalid.
  #' @examples


  # Normalize the accepted truthy spellings ("T", "true", 1, ...) to logical.
  .flag <- function(v) isTRUE(v) || v %in% c("true", "True", "T", "t", 1, "1")

  # take care of y target
  zy <- .flag(zy)
  # is.numeric() (rather than is.integer()) also accepts a plain double index
  # such as 3, which the original silently mis-treated as a column name.
  if (is.numeric(y)) { # y is a column index
    if (y > length(df) || y < 1) {
      print("Invalid column number")
      return(NULL)
    }
    if (zy) { df1 <- data.frame(scale(df[y])) } else { df1 <- df[y] } # save y-var in df1
    df <- df[-y] # remove y-variable in df
  } else { # y is not numeric, so interpret as name
    if (zy) { df1 <- data.frame(scale(df[names(df) == y])) } else { df1 <- df[names(df) == y] }
    df <- df[names(df) != y] # remove y-variable in df
  }
  if (length(df1) < 1) {
    print("Variable name not found in data.frame")
    return(NULL)
  }
  # now transform X-vars
  zX <- .flag(zX)
  df2 <- PCAxform(df, zX)
  df1 <- data.frame(df1, df2) # piece y and the rotated X back together
  return(df1)
}

Variance

loadPkg("pls")   # pcr() for principal component regression
loadPkg("mice")
#loadPkg("ISLR")


# PCA on the six numeric columns (revenue, budget, popularity, runtime,
# score, vote -- per the str() output earlier): once unscaled, once scaled.
pr = prcomp(movie_num[c(1:6)] , scale =FALSE)
pr_scale = prcomp(movie_num[c(1:6)], scale =TRUE)

We will also include revenue when checking variance.

Non-centered data

summary(pr)
## Importance of components:
##                             PC1      PC2 PC3  PC4  PC5   PC6
## Standard deviation     1.89e+08 3.10e+07 926 23.9 20.1 0.699
## Proportion of Variance 9.74e-01 2.62e-02   0  0.0  0.0 0.000
## Cumulative Proportion  9.74e-01 1.00e+00   1  1.0  1.0 1.000
#pr$rotation

Centered data

summary(pr_scale)
## Importance of components:
##                          PC1   PC2   PC3    PC4    PC5    PC6
## Standard deviation     1.768 1.108 0.894 0.6466 0.5045 0.4186
## Proportion of Variance 0.521 0.204 0.133 0.0697 0.0424 0.0292
## Cumulative Proportion  0.521 0.725 0.859 0.9284 0.9708 1.0000
#pr_scale$rotation

We see significant differences in the variances and standard deviations of the different components. Actually, the revenue and our predictors except budget are measured on different scales. The revenue and budget are measured in millions, which is significantly bigger than the scales of vote, score, popularity, etc.

Therefore, we recommend to scale the data before constructing the model. We will see the differences between non-centered and centered data in the following models.

PVE

# Cumulative proportion of variance explained (PVE) for both PCA versions.
# BUG FIX: the xlab strings were swapped -- the non-centered plot said
# "(standardized)" and the centered plot said "(non-standardized)".
pr.var <- (pr$sdev^2)               # variance of each non-centered component
pve <- pr.var/sum(pr.var)           # proportion of total variance explained
plot(cumsum(pve), xlab="Principal Component (non-standardized)", ylab ="Cumulative Proportion of Variance Explained",ylim=c(0,1),type="b", main ="Non-centered version")

pr_scale.var <- (pr_scale$sdev^2)   # variance of each standardized component
pve_scale  <- pr_scale.var/sum(pr_scale.var)
plot(cumsum(pve_scale), xlab="Principal Component (standardized)", ylab ="Cumulative Proportion of Variance Explained",ylim=c(0,1),type="b", main ="Centered version")

In the non-centered version, 1 component explains almost 100% of the variance. In the centered version, 1 component only explains about 50% of the variance. To reach 80%, we need 3 components. This is another piece of evidence indicating that we should perform scaling, since the revenue and budget seem to overwhelm the components in the non-centered version, causing PC1 to capture nearly the entire variance.

Model with PCA

# we used the train and test sets in linear regression model
# at first we use the PCRxform function (Prof. EL) to scale and rotate
# (train1/test1 are defined in an earlier chapter -- not visible here)

# training set
# we do not scale the response revenue but scale all predictors
movie_pcr = PCRxform(train1[c(1:6)],"revenue",TRUE,FALSE)  # scaled predictors
#movie_pcr

# testing set: same transformation applied to the held-out rows
movie_pcr_test = PCRxform(test1[c(1:6)],"revenue",TRUE,FALSE) 
#movie_pcr_test


# non-centered version, nothing is scaled (for the comparison below)
movie_pcr.nc = PCRxform(train1[c(1:6)],"revenue",FALSE, FALSE)  # non-scaled predictors
#movie_pcr.nc


movie_pcr.nc_test = PCRxform(test1[c(1:6)],"revenue",FALSE, FALSE) 
#movie_pcr.nc_test

PCR Model

# Fit PCR models with 10-fold cross-validation on both versions of the data.
# scaled data
pcr.fit <- pcr(revenue~.,data=movie_pcr,scale=FALSE,validation="CV") # build the model, as we scaled data before we use scale = F here 

# non-scaled data (scale = FALSE here is intentional: this IS the unscaled run)
pcr.fit.nc <- pcr(revenue~.,data=movie_pcr.nc,scale=FALSE,validation="CV")

Validation

Non-centered version

validationplot(pcr.fit.nc,val.type="MSEP", legend="topright")

validationplot(pcr.fit.nc,val.type="R2")

Centered version

validationplot(pcr.fit,val.type="MSEP", legend="topright") # validation with MSEP & R2

validationplot(pcr.fit,val.type="R2")

Both versions agree that 2 components give the optimized MSEP and R2.

We can see the coefficients of different components.

coefplot(pcr.fit, intercept=TRUE) # plot coefficients of different components

                                  # position 1 is the intercept, the first PC starts at position 2

We can see that after PC2, the change of coefficients is not significant compared to the difference between the coefficients of PC1 and PC2. Hence, we can use PC1 and PC2 to optimize the model since two components are sufficient to capture the variance.

We can make a comparison between the non-centered and centered versions.

Non-centered version

# we can see the summary of the model
summary(pcr.fit.nc)
## Data:    X dimension: 2176 5 
##  Y dimension: 2176 1
## Fit method: svdpc
## Number of components considered: 5
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)    1 comps    2 comps    3 comps    4 comps    5 comps
## CV        1.88e+08  122151898  108524196  107141642  102852443  102965230
## adjCV     1.88e+08  122077691  108448087  107128888  102792895  102854326
## 
## TRAINING: % variance explained
##          1 comps  2 comps  3 comps  4 comps  5 comps
## X          49.00    71.71    87.31    95.63   100.00
## revenue    57.87    66.61    67.94    70.47    71.13
summary(pcr.fit)
## Data:    X dimension: 2176 5 
##  Y dimension: 2176 1
## Fit method: svdpc
## Number of components considered: 5
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)    1 comps    2 comps   3 comps    4 comps    5 comps
## CV        1.88e+08  121720344  106321914  1.06e+08  103819563  102861273
## adjCV     1.88e+08  121665663  106238340  1.06e+08  103733078  102758138
## 
## TRAINING: % variance explained
##          1 comps  2 comps  3 comps  4 comps  5 comps
## X          48.50    71.69    87.41    95.61   100.00
## revenue    58.21    68.09    68.67    70.49    71.13

Scaled data have better variance explanation for revenue than non-scaled data.

Prediction

Let’s try the pcr model on testing data. We will use the centered version.

# Evaluate the (centered) PCR model on the testing set for 1..5 components.
n_comp <- 5
variance.pcr <- numeric(n_comp)  # preallocate (was c(1:5), an integer vector)

for (i in seq_len(n_comp)) {
  pcr_pred <- predict(pcr.fit, movie_pcr_test , ncomp = i); # predict with the first i principal components
  
  # spread of the predictions around the mean of the actual revenues:
  # mean( (predicted value - mean(actual values))^2 )
  variance.pcr[i] <- mean((pcr_pred - mean(movie_pcr_test$revenue))^2) 
}

# data.frame() directly instead of as.data.frame(cbind(...)), same columns
var.pcr.df <- data.frame(variance = variance.pcr, components = seq_len(n_comp))

#pred_act <- data.frame(cbind(pcr_pred, movie_pcr_test$revenue))


ggplot(data=var.pcr.df) +
  geom_line(aes(x = components, y = variance)) +
  geom_point(aes(x = components, y = variance)) +
  theme_light()

There is a significant increase in the variance from PC1 to PC2, after that the change of variance is not too drastic. We can say that 2 principal components can capture the majority of variance in the testing data. Our PCR model seems to perform properly in the testing data.

Linear Model

We can use principal components as the predictors for a linear model. We will use two components to build the models with two versions: centered and non-centered.

Non-centered version

pcr_lm5 <- lm(revenue ~ PC1 + PC2 , data = movie_pcr.nc) # linear model with PC1 and PC2
summary(pcr_lm5)
## 
## Call:
## lm(formula = revenue ~ PC1 + PC2, data = movie_pcr.nc)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -1.20e+09 -4.08e+07 -5.95e+06  2.28e+07  1.76e+09 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 121493614    2330120    52.1   <2e-16 ***
## PC1         -89813816    1463518   -61.4   <2e-16 ***
## PC2          51258272    2149532    23.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.09e+08 on 2173 degrees of freedom
## Multiple R-squared:  0.666,  Adjusted R-squared:  0.666 
## F-statistic: 2.17e+03 on 2 and 2173 DF,  p-value: <2e-16
vif(pcr_lm5)
## PC1 PC2 
##   1   1

Centered version

pcr_lm4 <- lm(revenue ~ PC1 + PC2 , data = movie_pcr)
summary(pcr_lm4)
## 
## Call:
## lm(formula = revenue ~ PC1 + PC2, data = movie_pcr)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -1.11e+09 -4.03e+07 -4.95e+06  2.42e+07  1.73e+09 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 120012282    2277573    52.7   <2e-16 ***
## PC1         -92108152    1462914   -63.0   <2e-16 ***
## PC2          54901946    2115458    25.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.06e+08 on 2173 degrees of freedom
## Multiple R-squared:  0.681,  Adjusted R-squared:  0.681 
## F-statistic: 2.32e+03 on 2 and 2173 DF,  p-value: <2e-16
vif(pcr_lm4)
## PC1 PC2 
##   1   1

All vifs are 1, nice feature from PCA method.

The scaled version gives better results than non-scaled version.

We can also use AIC and BIC for validation between non-scaled and scaled versions.

# Compare the two PC linear models; lower AIC/BIC = better fit.
AIC(object = pcr_lm4)
## [1] 86611
BIC(object = pcr_lm4)
## [1] 86633
AIC(object = pcr_lm5)
## [1] 86710
BIC(object = pcr_lm5)
## [1] 86732
#str(movie)

# NOTE(review): require() inside if() is used here only as a loaded-package
# guard before detaching; unload = T uses the reassignable shorthand for TRUE.
if (require("pls")) {detach("package:pls", unload = T)  # for some reason package "pls" made the corrplot not proper so I detach it after using
}

Both AIC and BIC agree that the scaled version is better.

# Predict test-set revenue with the centered two-component linear model.
prd_pcr <- predict(pcr_lm4, newdata=movie_pcr_test)

# Regression error metrics (mae, mse, rmse, mape) from package DMwR.
regr.eval(movie_pcr_test$revenue, prd_pcr)
##      mae      mse     rmse     mape 
## 1.13e+08 2.41e+16 1.55e+08 7.21e+04

Conclusion

Comparing to the linear model (of numerical variables) in the previous chapter, the Adjusted R-squared slightly decreases. The value drops from 71.1% to 68.1%. This is in line with our expectation, since the goal of PCA is to reduce dimensionality. Instead of using 5 variables (budget, popularity, vote, score, runtime), we need only 2 variables (PC1 and PC2). We can speed up the computational process while not significantly hurting the performance of the model.

Chapter 6. Profitability

In this chapter, we use different kinds of model to predict if a movie earns profit or not (box office revenue > budget -> earn profit). In our data, we use column profitable to record whether a movie earns profit or not; a movie with revenue > budget is labeled as 1, otherwise it is labeled as 0.

We use three models in this section: Logistic Regression, Classification Tree and KNN.

Prior Chi-squared test

Genres

# Prior check the dependence of genres and profitable
# Contingency table of genre vs the 0/1 profitable label, then a Pearson
# chi-squared test of independence.
contable1 <- table(movie$genres, movie$profitable)
chisqres1 = chisq.test(contable1)
chisqres1
## 
##  Pearson's Chi-squared test
## 
## data:  contable1
## X-squared = 54, df = 17, p-value = 1e-05

Company

# Prior check the dependence of company and profitable
# Same independence test as above, for production company vs profitable.
contable2 <- table(movie$company, movie$profitable)
chisqres2 = chisq.test(contable2)
chisqres2
## 
##  Pearson's Chi-squared test
## 
## data:  contable2
## X-squared = 93, df = 5, p-value <2e-16

Season

# Prior check the dependence of season and profitable
# Same independence test as above, for release season vs profitable.
contable3 <- table(movie$season, movie$profitable)
chisqres3 = chisq.test(contable3)
chisqres3
## 
##  Pearson's Chi-squared test
## 
## data:  contable3
## X-squared = 20, df = 3, p-value = 1e-04
  • Low p-values, there is evidence that profitable is dependent on season, company and genres.

Logistic Regression

In this part we will construct the logit model on the whole dataset.

# we do some preparation with the data for our logit model

movie_nd <- movie_num # duplicate the dataframe of numerical variables

# append back the other columns (genres/company/season are factors, see str())
movie_nd$genres = movie$genres 
movie_nd$company = movie$company
movie_nd$season = movie$season
# y must be the LAST column for bestglm(Xy = ...) below
movie_nd$y = movie$profitable # profitable column (negative profit = 0, positive profit = 1) is saved as y

str(movie_nd)
## 'data.frame':    3225 obs. of  10 variables:
##  $ revenue   : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
##  $ budget    : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
##  $ popularity: num  150.4 139.1 107.4 112.3 43.9 ...
##  $ runtime   : num  162 169 148 165 132 139 100 141 153 151 ...
##  $ score     : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
##  $ vote      : int  11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
##  $ genres    : Factor w/ 18 levels "Action","Adventure",..: 1 2 1 1 1 9 3 1 2 1 ...
##  $ company   : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 5 3 1 5 3 5 5 6 6 ...
##  $ season    : Factor w/ 4 levels "Spring","Summer",..: 4 1 3 2 1 1 3 1 2 1 ...
##  $ y         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# split the train and test sets with ratio 50:50
#set.seed(1)
#movie_sample1 <- sample(2, nrow(movie_scale), replace=TRUE, prob=c(0.50, 0.50))

# create train and test sets
#train2 <- movie_scale[movie_sample1==1, 1:9]
#test2 <- movie_scale[movie_sample1==2, 1:9]

Budget and revenue are enough to determine profitability, as the profit is calculated as revenue minus budget. Our pre-test with “bestglm” also shows the same result.

loadPkg("bestglm")
# Exhaustive best-subset logistic regression over all predictors in movie_nd
# (bestglm expects the response y as the last column of Xy).
res.bestglm0 <- bestglm(Xy = movie_nd, family = binomial,
            IC = "AIC",                 # Information criteria for
            method = "exhaustive")
#summary(res.bestglm)
# Top five candidate models by AIC:
res.bestglm0$BestModels
##   revenue budget popularity runtime score  vote genres company season
## 1    TRUE   TRUE      FALSE   FALSE FALSE FALSE  FALSE   FALSE  FALSE
## 2    TRUE   TRUE      FALSE   FALSE FALSE FALSE  FALSE   FALSE   TRUE
## 3    TRUE   TRUE      FALSE    TRUE FALSE FALSE  FALSE   FALSE  FALSE
## 4    TRUE   TRUE       TRUE   FALSE FALSE FALSE  FALSE   FALSE   TRUE
## 5    TRUE   TRUE      FALSE   FALSE FALSE  TRUE  FALSE   FALSE   TRUE
##   Criterion
## 1      17.4
## 2      17.6
## 3      18.1
## 4      18.3
## 5      18.5

However, since the relationship between (revenue + budget) and profitable is too direct, we should not use them together.

In reality, we prefer budget rather than revenue to predict profit. A film manager would want to have a prediction of the profit of a movie before its main release date. The information he/she has is the budget, runtime, genres, production company, popularity, vote and score (vote and score can be obtained from a preview screening of a movie; popularity can be generated after advertisement, trailers and some leaks from a movie). Revenue should play the role of the response in the model rather than a predictor.

Model Construction

Let’s try the model with budget and other predictors

# we will use the same train and test sets as in previous chapters
# (movie_sample was created in an earlier chapter -- not visible here)

# remove revenue column and use other predictors in train and test sets
train3 <- movie_nd[movie_sample == 1, 2:10] # revenue column is 1, we do not include it
test3 <- movie_nd[movie_sample == 2, 2:10]

dim(train3)
## [1] 2176    9
dim(test3)
## [1] 1049    9
# Logistic regression of profitable (y) on all remaining predictors.
prf_glm <- glm(y ~ ., data = train3, family = "binomial")
summary(prf_glm)
## 
## Call:
## glm(formula = y ~ ., family = "binomial", data = train3)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.683   0.000   0.297   0.735   1.743  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -2.05e+00   5.35e-01   -3.84  0.00012 ***
## budget                    -1.75e-08   2.51e-09   -6.99  2.8e-12 ***
## popularity                 1.61e-02   1.26e-02    1.28  0.20159    
## runtime                    1.66e-03   3.28e-03    0.50  0.61421    
## score                      2.27e-01   8.46e-02    2.68  0.00739 ** 
## vote                       2.67e-03   4.49e-04    5.95  2.7e-09 ***
## genresAdventure            1.73e-01   2.51e-01    0.69  0.49015    
## genresAnimation            2.70e-01   3.97e-01    0.68  0.49646    
## genresComedy               4.31e-01   1.91e-01    2.26  0.02360 *  
## genresCrime                7.35e-02   2.96e-01    0.25  0.80411    
## genresDocumentary          4.66e-01   5.25e-01    0.89  0.37479    
## genresDrama                7.25e-02   1.91e-01    0.38  0.70424    
## genresFamily               2.31e-01   5.54e-01    0.42  0.67656    
## genresFantasy              2.28e-01   4.32e-01    0.53  0.59765    
## genresHistory              6.55e-01   7.13e-01    0.92  0.35873    
## genresHorror               7.92e-01   3.16e-01    2.51  0.01212 *  
## genresMusic                2.74e-01   7.09e-01    0.39  0.69916    
## genresMystery             -4.23e-01   6.33e-01   -0.67  0.50393    
## genresRomance              8.50e-01   4.26e-01    2.00  0.04590 *  
## genresScience Fiction      2.64e-01   5.05e-01    0.52  0.60148    
## genresThriller            -1.50e-01   3.29e-01   -0.46  0.64724    
## genresWar                 -1.36e+00   8.29e-01   -1.64  0.10019    
## genresWestern              2.27e+00   1.08e+00    2.10  0.03582 *  
## companyParamount Pictures  9.81e-01   2.43e-01    4.03  5.5e-05 ***
## companySony Pictures       6.20e-01   2.22e-01    2.79  0.00524 ** 
## companyUniversal Pictures  8.52e-01   2.34e-01    3.64  0.00027 ***
## companyWalt Disney         8.60e-01   1.84e-01    4.67  3.0e-06 ***
## companyWarner Bros         7.69e-01   2.45e-01    3.13  0.00172 ** 
## seasonSummer               3.90e-01   1.74e-01    2.24  0.02505 *  
## seasonFall                -4.65e-02   1.62e-01   -0.29  0.77361    
## seasonWinter               7.04e-02   1.69e-01    0.42  0.67734    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2409.2  on 2175  degrees of freedom
## Residual deviance: 1805.0  on 2145  degrees of freedom
## AIC: 1867
## 
## Number of Fisher Scoring iterations: 8
exp(coef(prf_glm))[24:28] # company
## companyParamount Pictures      companySony Pictures 
##                      2.67                      1.86 
## companyUniversal Pictures        companyWalt Disney 
##                      2.34                      2.36 
##        companyWarner Bros 
##                      2.16
exp(coef(prf_glm))[7:23] # genres
##       genresAdventure       genresAnimation          genresComedy 
##                 1.189                 1.310                 1.539 
##           genresCrime     genresDocumentary           genresDrama 
##                 1.076                 1.594                 1.075 
##          genresFamily         genresFantasy         genresHistory 
##                 1.260                 1.256                 1.925 
##          genresHorror           genresMusic         genresMystery 
##                 2.208                 1.315                 0.655 
##         genresRomance genresScience Fiction        genresThriller 
##                 2.340                 1.302                 0.860 
##             genresWar         genresWestern 
##                 0.256                 9.707
exp(coef(prf_glm))[29:31] # seasons
## seasonSummer   seasonFall seasonWinter 
##        1.478        0.955        1.073
#length(coef(prf_glm))

We can test the effects of different genres/companies/seasons on the prediction to see whether they are significant.

  • Genres
loadPkg("aod")  # Analysis of Overdispersed Data; provides wald.test(), as in the logit example

# Joint Wald test that all genre coefficients (model terms 7:23) are zero,
# i.e. whether genre as a whole contributes significantly to the model.
wald.test(b = coef(prf_glm), Sigma = vcov(prf_glm), Terms = 7:23)
## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 23.8, df = 17, P(> X2) = 0.12

It seems that different genres do not have significant effects on the response in our logit model.

  • Company
wald.test(b = coef(prf_glm), Sigma = vcov(prf_glm), Terms = 24:28)
## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 46.2, df = 5, P(> X2) = 8.4e-09

The effects of different companies are significant.

  • Season
wald.test(b = coef(prf_glm), Sigma = vcov(prf_glm), Terms = 29:31)
## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 8.0, df = 3, P(> X2) = 0.045

The effects of different seasons are significant, but not as clear as the effects of different companies.

Feature Selection

#drop revenue
res.bestglm <- bestglm(Xy = train3, family = binomial,
            IC = "AIC",                 # Information criteria for
            method = "exhaustive")
#summary(res.bestglm)
res.bestglm$BestModels
##   budget popularity runtime score vote genres company season Criterion
## 1   TRUE      FALSE   FALSE  TRUE TRUE  FALSE    TRUE   TRUE      1855
## 2   TRUE       TRUE   FALSE  TRUE TRUE  FALSE    TRUE   TRUE      1856
## 3   TRUE      FALSE    TRUE  TRUE TRUE  FALSE    TRUE   TRUE      1857
## 4   TRUE       TRUE    TRUE  TRUE TRUE  FALSE    TRUE   TRUE      1858
## 5   TRUE      FALSE   FALSE  TRUE TRUE  FALSE    TRUE  FALSE      1858
#summary(res.bestglm$BestModels)
  • Best model : budget + score + vote + company + season

Best Logit Model

Build the best model

# Refit the logit model using only the predictors chosen by bestglm above.
prf_glm0 <- glm(y ~  budget + score + vote + company + season, data = train3, family = "binomial")
summary(prf_glm0)
## 
## Call:
## glm(formula = y ~ budget + score + vote + company + season, family = "binomial", 
##     data = train3)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.733   0.000   0.306   0.763   1.770  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.44e+00   4.57e-01   -3.15  0.00163 ** 
## budget                    -1.83e-08   2.30e-09   -7.95  1.9e-15 ***
## score                      2.10e-01   7.18e-02    2.93  0.00340 ** 
## vote                       3.18e-03   2.37e-04   13.43  < 2e-16 ***
## companyParamount Pictures  9.64e-01   2.40e-01    4.02  5.7e-05 ***
## companySony Pictures       5.48e-01   2.19e-01    2.50  0.01237 *  
## companyUniversal Pictures  8.42e-01   2.30e-01    3.67  0.00024 ***
## companyWalt Disney         8.75e-01   1.80e-01    4.86  1.2e-06 ***
## companyWarner Bros         7.88e-01   2.41e-01    3.27  0.00109 ** 
## seasonSummer               3.84e-01   1.72e-01    2.24  0.02513 *  
## seasonFall                -6.93e-02   1.59e-01   -0.43  0.66375    
## seasonWinter               5.37e-02   1.66e-01    0.32  0.74708    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2409.2  on 2175  degrees of freedom
## Residual deviance: 1832.9  on 2164  degrees of freedom
## AIC: 1857
## 
## Number of Fisher Scoring iterations: 7
#AIC(prf_glm0) # we can check AIC and BIC
#BIC(prf_glm0)
exp(coef(prf_glm0))
##               (Intercept)                    budget 
##                     0.237                     1.000 
##                     score                      vote 
##                     1.234                     1.003 
## companyParamount Pictures      companySony Pictures 
##                     2.623                     1.729 
## companyUniversal Pictures        companyWalt Disney 
##                     2.322                     2.399 
##        companyWarner Bros              seasonSummer 
##                     2.200                     1.469 
##                seasonFall              seasonWinter 
##                     0.933                     1.055

Model Evaluation

We can validate the model on the testing set with following methods:

Hosmer and Lemeshow test

prof_glm_pred = predict(object = prf_glm0, test3, type = c("response")) # predicted probabilities of the best logit model on the testing data


loadPkg("ResourceSelection") # provides hoslem.test( ) for logit model evaluation

# Hosmer-Lemeshow goodness-of-fit test (a chi-squared test on groups of
# predicted risk). NOTE: a LOW p-value indicates a significant LACK of fit.
prf_Hos0 = hoslem.test(test3$y, prof_glm_pred) # Hosmer and Lemeshow test, a chi-squared test
prf_Hos0
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  test3$y, prof_glm_pred
## X-squared = 1049, df = 8, p-value <2e-16

The p-value is extremely low, so the Hosmer–Lemeshow test rejects the null hypothesis of good fit: by this criterion the model does not fit the data well.

ROC curve and AUC

loadPkg("pROC") 


# ROC curve for the best logit model, computed on the testing data.
h0 <- roc(test3$y ~ prof_glm_pred) # actual classes vs. predicted probabilities
auc(h0) # area under the curve; 0.8 or higher is generally considered good
## Area under the curve: 0.849
plot(h0)
title("ROC curve")

The area under the curve is more than 0.80. This test also agrees with the Hosmer and Lemeshow test.

McFadden

loadPkg("pscl") # pR2( ) computes McFadden's pseudo-R^2 and related fit statistics
# Pseudo-R^2 measures for the best logit model (llh, McFadden, r2ML, r2CU).
prf_mcFadden = pR2(prf_glm0)
prf_mcFadden
##       llh   llhNull        G2  McFadden      r2ML      r2CU 
##  -916.473 -1204.608   576.271     0.239     0.233     0.348

McFadden's pseudo-R² is 0.239, meaning the model accounts for about 24% of the null deviance. Not great, but acceptable.

Confusion Matrix

# Confusion matrices for the logit model at several probability cut-offs.
# For each threshold we dichotomize the predicted probabilities, compare
# against the actual test-set classes, and record accuracy and kappa.
loadPkg("caret")

threshold_logit <- c(0.5, 0.6, 0.7, 0.8, 0.9) # candidate thresholds for predicted probabilities

# Preallocate one result slot per threshold.
logit_accuracy <- numeric(length(threshold_logit))
logit_kappa <- numeric(length(threshold_logit))

for (j in seq_along(threshold_logit)) {
  # Predicted class is 1 when the probability exceeds the threshold.
  pred_class <- as.factor(as.integer(prof_glm_pred > threshold_logit[j]))
  conf_matrix <- confusionMatrix(data = pred_class, reference = test3$y)
  logit_accuracy[j] <- conf_matrix$overall[1] * 100 # accuracy as a percentage
  logit_kappa[j] <- conf_matrix$overall[2]          # Cohen's kappa
}

# as an example, the full confusion-matrix output at threshold = 0.54
confusionMatrix(data = as.factor(as.integer(prof_glm_pred>0.54)), reference = test3$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 130  59
##          1 130 730
##                                         
##                Accuracy : 0.82          
##                  95% CI : (0.795, 0.843)
##     No Information Rate : 0.752         
##     P-Value [Acc > NIR] : 9.29e-08      
##                                         
##                   Kappa : 0.468         
##                                         
##  Mcnemar's Test P-Value : 3.55e-07      
##                                         
##             Sensitivity : 0.500         
##             Specificity : 0.925         
##          Pos Pred Value : 0.688         
##          Neg Pred Value : 0.849         
##              Prevalence : 0.248         
##          Detection Rate : 0.124         
##    Detection Prevalence : 0.180         
##       Balanced Accuracy : 0.713         
##                                         
##        'Positive' Class : 0             
## 
# combine results into a dataframe

logit_prediction <- as.data.frame(cbind(threshold = threshold_logit, accuracy = logit_accuracy, kappa = logit_kappa))
logit_prediction 
##   threshold accuracy kappa
## 1       0.5     81.9 0.425
## 2       0.6     80.7 0.486
## 3       0.7     77.4 0.476
## 4       0.8     69.0 0.379
## 5       0.9     59.2 0.280

Classification Tree

Model Construction

loadPkg("ISLR")
loadPkg("tree")
loadPkg("rpart")
loadPkg("rpart.plot")
loadPkg("rattle") # For fancyRpartPlot (Trees) Answer "no" on installing from binary source

# We use the same training and testing data as for the logit model so the
# classification tree and logistic regression are directly comparable.
prf_dt <- rpart(y~., method = "class", data=train3) #rpart builds the classification tree
summary(prf_dt) # detailed summary of splits, surrogate splits and variable importance
## Call:
## rpart(formula = y ~ ., data = train3, method = "class")
##   n= 2176 
## 
##       CP nsplit rel error xerror   xstd
## 1 0.0759      0     1.000  1.000 0.0379
## 2 0.0266      2     0.848  0.863 0.0360
## 3 0.0202      4     0.795  0.869 0.0361
## 4 0.0114      7     0.734  0.822 0.0353
## 5 0.0100      8     0.723  0.829 0.0355
## 
## Variable importance
##       vote popularity     budget     genres      score    company 
##         41         32         12          5          5          4 
##    runtime 
##          1 
## 
## Node number 1: 2176 observations,    complexity param=0.0759
##   predicted class=1  expected loss=0.242  P(node) =1
##     class counts:   527  1649
##    probabilities: 0.242 0.758 
##   left son=2 (744 obs) right son=3 (1432 obs)
##   Primary splits:
##       vote       < 260      to the left,  improve=119.0, (0 missing)
##       popularity < 15.2     to the left,  improve=111.0, (0 missing)
##       score      < 5.35     to the left,  improve= 31.0, (0 missing)
##       company    splits as  LRRRRR,       improve= 21.9, (0 missing)
##       budget     < 83500000 to the left,  improve= 13.2, (0 missing)
##   Surrogate splits:
##       popularity < 14.1     to the left,  agree=0.938, adj=0.820, (0 split)
##       budget     < 10100000 to the left,  agree=0.696, adj=0.110, (0 split)
##       score      < 5.15     to the left,  agree=0.691, adj=0.097, (0 split)
##       genres     splits as  RRRRRLRRRRRLRRRRRR, agree=0.666, adj=0.024, (0 split)
##       runtime    < 86.5     to the left,  agree=0.663, adj=0.015, (0 split)
## 
## Node number 2: 744 observations,    complexity param=0.0759
##   predicted class=1  expected loss=0.472  P(node) =0.342
##     class counts:   351   393
##    probabilities: 0.472 0.528 
##   left son=4 (364 obs) right son=5 (380 obs)
##   Primary splits:
##       budget     < 14100000 to the right, improve=27.20, (0 missing)
##       vote       < 95.5     to the left,  improve=13.00, (0 missing)
##       popularity < 9.44     to the left,  improve=12.00, (0 missing)
##       score      < 5.15     to the left,  improve= 7.47, (0 missing)
##       company    splits as  LRLLRR,       improve= 4.77, (0 missing)
##   Surrogate splits:
##       popularity < 7.19     to the right, agree=0.637, adj=0.258, (0 split)
##       vote       < 97.5     to the right, agree=0.633, adj=0.250, (0 split)
##       score      < 5.95     to the left,  agree=0.616, adj=0.214, (0 split)
##       genres     splits as  LLLRRRRLLRRLLRLLRR, agree=0.594, adj=0.170, (0 split)
##       company    splits as  RLLLLL, agree=0.591, adj=0.165, (0 split)
## 
## Node number 3: 1432 observations
##   predicted class=1  expected loss=0.123  P(node) =0.658
##     class counts:   176  1256
##    probabilities: 0.123 0.877 
## 
## Node number 4: 364 observations,    complexity param=0.0266
##   predicted class=0  expected loss=0.39  P(node) =0.167
##     class counts:   222   142
##    probabilities: 0.610 0.390 
##   left son=8 (113 obs) right son=9 (251 obs)
##   Primary splits:
##       vote       < 91.5     to the left,  improve=21.70, (0 missing)
##       popularity < 5.77     to the left,  improve=17.10, (0 missing)
##       score      < 5.05     to the left,  improve= 4.26, (0 missing)
##       genres     splits as  LRRRRLLLLRLRRLLRLL, improve= 3.43, (0 missing)
##       budget     < 37500000 to the right, improve= 2.85, (0 missing)
##   Surrogate splits:
##       popularity < 6.07     to the left,  agree=0.882, adj=0.619, (0 split)
##       runtime    < 153      to the right, agree=0.698, adj=0.027, (0 split)
##       genres     splits as  RRRRRLRRRRRRRRRRLR, agree=0.695, adj=0.018, (0 split)
## 
## Node number 5: 380 observations,    complexity param=0.0202
##   predicted class=1  expected loss=0.339  P(node) =0.175
##     class counts:   129   251
##    probabilities: 0.339 0.661 
##   left son=10 (255 obs) right son=11 (125 obs)
##   Primary splits:
##       company    splits as  LRRRRR, improve=12.00, (0 missing)
##       vote       < 45.5     to the left,  improve=10.40, (0 missing)
##       genres     splits as  RRLLLLLRRLRRLRRLRR, improve= 8.83, (0 missing)
##       popularity < 7.88     to the left,  improve= 7.62, (0 missing)
##       score      < 6.85     to the left,  improve= 3.69, (0 missing)
##   Surrogate splits:
##       budget < 9120000  to the left,  agree=0.695, adj=0.072, (0 split)
##       genres splits as  LLLLLLLLLLLLLLRLRL, agree=0.682, adj=0.032, (0 split)
##       vote   < 258      to the left,  agree=0.676, adj=0.016, (0 split)
## 
## Node number 8: 113 observations
##   predicted class=0  expected loss=0.133  P(node) =0.0519
##     class counts:    98    15
##    probabilities: 0.867 0.133 
## 
## Node number 9: 251 observations,    complexity param=0.0266
##   predicted class=1  expected loss=0.494  P(node) =0.115
##     class counts:   124   127
##    probabilities: 0.494 0.506 
##   left son=18 (109 obs) right son=19 (142 obs)
##   Primary splits:
##       budget     < 32500000 to the right, improve=5.61, (0 missing)
##       genres     splits as  LRRRL-LRLRLLRLLL-L, improve=3.97, (0 missing)
##       vote       < 196      to the left,  improve=3.28, (0 missing)
##       popularity < 8.96     to the left,  improve=3.05, (0 missing)
##       runtime    < 95.5     to the right, improve=2.67, (0 missing)
##   Surrogate splits:
##       genres     splits as  RLLRL-RRRRLRRRLR-R, agree=0.606, adj=0.092, (0 split)
##       popularity < 12.3     to the right, agree=0.590, adj=0.055, (0 split)
##       runtime    < 108      to the right, agree=0.586, adj=0.046, (0 split)
##       company    splits as  RLRRRR, agree=0.586, adj=0.046, (0 split)
##       score      < 5.35     to the left,  agree=0.578, adj=0.028, (0 split)
## 
## Node number 10: 255 observations,    complexity param=0.0202
##   predicted class=1  expected loss=0.427  P(node) =0.117
##     class counts:   109   146
##    probabilities: 0.427 0.573 
##   left son=20 (181 obs) right son=21 (74 obs)
##   Primary splits:
##       genres     splits as  RRLLLLLRRLRLLR-L-R, improve=9.30, (0 missing)
##       vote       < 53.5     to the left,  improve=7.72, (0 missing)
##       popularity < 7.7      to the left,  improve=7.02, (0 missing)
##       score      < 6.85     to the left,  improve=4.48, (0 missing)
##       runtime    < 170      to the left,  improve=3.02, (0 missing)
## 
## Node number 11: 125 observations
##   predicted class=1  expected loss=0.16  P(node) =0.0574
##     class counts:    20   105
##    probabilities: 0.160 0.840 
## 
## Node number 18: 109 observations,    complexity param=0.0114
##   predicted class=0  expected loss=0.385  P(node) =0.0501
##     class counts:    67    42
##    probabilities: 0.615 0.385 
##   left son=36 (69 obs) right son=37 (40 obs)
##   Primary splits:
##       vote       < 204      to the left,  improve=4.55, (0 missing)
##       popularity < 8.96     to the left,  improve=3.96, (0 missing)
##       genres     splits as  LLRLL-LLLRLLLRLL--, improve=3.02, (0 missing)
##       season     splits as  LRLL, improve=2.97, (0 missing)
##       runtime    < 130      to the left,  improve=2.75, (0 missing)
##   Surrogate splits:
##       popularity < 13       to the left,  agree=0.734, adj=0.275, (0 split)
##       budget     < 35500000 to the right, agree=0.661, adj=0.075, (0 split)
##       genres     splits as  LRLLR-LLLLLLLLLL--, agree=0.661, adj=0.075, (0 split)
##       season     splits as  LLRL, agree=0.642, adj=0.025, (0 split)
## 
## Node number 19: 142 observations
##   predicted class=1  expected loss=0.401  P(node) =0.0653
##     class counts:    57    85
##    probabilities: 0.401 0.599 
## 
## Node number 20: 181 observations,    complexity param=0.0202
##   predicted class=0  expected loss=0.486  P(node) =0.0832
##     class counts:    93    88
##    probabilities: 0.514 0.486 
##   left son=40 (86 obs) right son=41 (95 obs)
##   Primary splits:
##       vote       < 56       to the left,  improve=9.72, (0 missing)
##       popularity < 9.11     to the left,  improve=5.86, (0 missing)
##       score      < 6.85     to the left,  improve=5.69, (0 missing)
##       runtime    < 170      to the left,  improve=4.42, (0 missing)
##       budget     < 876000   to the right, improve=3.29, (0 missing)
##   Surrogate splits:
##       popularity < 4.72     to the left,  agree=0.901, adj=0.791, (0 split)
##       score      < 6.35     to the left,  agree=0.608, adj=0.174, (0 split)
##       runtime    < 102      to the left,  agree=0.575, adj=0.105, (0 split)
##       budget     < 3750000  to the left,  agree=0.569, adj=0.093, (0 split)
##       genres     splits as  --RRRLR--L-LR--R--, agree=0.569, adj=0.093, (0 split)
## 
## Node number 21: 74 observations
##   predicted class=1  expected loss=0.216  P(node) =0.034
##     class counts:    16    58
##    probabilities: 0.216 0.784 
## 
## Node number 36: 69 observations
##   predicted class=0  expected loss=0.275  P(node) =0.0317
##     class counts:    50    19
##    probabilities: 0.725 0.275 
## 
## Node number 37: 40 observations
##   predicted class=1  expected loss=0.425  P(node) =0.0184
##     class counts:    17    23
##    probabilities: 0.425 0.575 
## 
## Node number 40: 86 observations
##   predicted class=0  expected loss=0.314  P(node) =0.0395
##     class counts:    59    27
##    probabilities: 0.686 0.314 
## 
## Node number 41: 95 observations
##   predicted class=1  expected loss=0.358  P(node) =0.0437
##     class counts:    34    61
##    probabilities: 0.358 0.642
prf_dt$variable.importance#variable importance
##       vote popularity     budget     genres      score    company 
##    162.149    127.432     48.045     19.347     19.210     16.739 
##    runtime     season 
##      3.613      0.114
#printcp(prf_dt) # display the results 

We can visualize our tree

# plot tree 
prp(prf_dt, main = "Classification Trees for Profit Status")

#rpart.plot(prf_dt,main = "Classification Trees for Profit Status")
fancyRpartPlot(prf_dt)

Pruned Tree model

We can optimize our tree by pruning (in complex model, pruning helps to reduce overfitting).

printcp(prf_dt) # display the results 
## 
## Classification tree:
## rpart(formula = y ~ ., data = train3, method = "class")
## 
## Variables actually used in tree construction:
## [1] budget  company genres  vote   
## 
## Root node error: 527/2176 = 0.2
## 
## n= 2176 
## 
##     CP nsplit rel error xerror xstd
## 1 0.08      0       1.0    1.0 0.04
## 2 0.03      2       0.8    0.9 0.04
## 3 0.02      4       0.8    0.9 0.04
## 4 0.01      7       0.7    0.8 0.04
## 5 0.01      8       0.7    0.8 0.04
plotcp(prf_dt) # visualize cross-validation results 

The cross-validated error (xerror) is lowest at the fourth CP row, with 7 splits, so we prune back to that subtree.

Let’s see our tree after pruning.

 # Prune at the CP value whose cross-validated error (xerror) is smallest.
 prf_dt_prune <- prune(prf_dt, cp = prf_dt$cptable[which.min(prf_dt$cptable[,"xerror"]),"CP"]) # prune the tree at the cp where xerror is minimum
 #prf_dt_prune <- prune(prf_dt, cp = prf_dt$cptable[4,"CP"])
# plot the pruned tree
prp(prf_dt_prune, main = "Classification Trees for Profit Status")

#rpart.plot(prf_dt_prune,main = "Classification Trees for Profit Status")
fancyRpartPlot(prf_dt_prune, main = "Classification Trees for Profit Status")

The last split was pruned.

Model Evaluation

Accuraccy

prf_dt.pred <- predict(prf_dt_prune, test3, type = "class" ) # predict class labels on the testing data with the pruned tree


table_dt <- table(actual = test3$y, predicted = prf_dt.pred) # confusion matrix of actual vs. predicted classes
table_dt
##       predicted
## actual   0   1
##      0  99 161
##      1  44 745
# Accuracy Calculation
sum(diag(table_dt))/sum(table_dt) # (true positives + true negatives) / total
## [1] 0.805
confusionMatrix(reference = test3$y, data = prf_dt.pred) 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  99  44
##          1 161 745
##                                         
##                Accuracy : 0.805         
##                  95% CI : (0.779, 0.828)
##     No Information Rate : 0.752         
##     P-Value [Acc > NIR] : 3.26e-05      
##                                         
##                   Kappa : 0.383         
##                                         
##  Mcnemar's Test P-Value : 5.42e-16      
##                                         
##             Sensitivity : 0.3808        
##             Specificity : 0.9442        
##          Pos Pred Value : 0.6923        
##          Neg Pred Value : 0.8223        
##              Prevalence : 0.2479        
##          Detection Rate : 0.0944        
##    Detection Prevalence : 0.1363        
##       Balanced Accuracy : 0.6625        
##                                         
##        'Positive' Class : 0             
## 
#length(prf_dt.pred)

ROC curve and AUC

# ROC curve for the classification tree.
# predict(..., type = "prob") on an rpart object returns class probabilities;
# observations with predicted probability > 0.5 coincide with the class
# labels produced above by type = "class", so the confusion matrices match.


# prediction using type = "prob"
prf_dt.pred1 <- predict(prf_dt_prune, test3, type = "prob" ) 



#prf_dt.conf <- confusionMatrix(data = as.factor(as.integer(prf_dt.pred1[,2]>0.5)), reference = test3$y) 
# the accuracy is the same as with type = "class"; uncomment to verify
#prf_dt.conf # the accuracy is also 80.5% and the confusion matrix is the same too

# plot ROC on the probability of class 1 (column 2) and compute AUC
h_dt <- roc(test3$y ~ prf_dt.pred1[,2])
auc(h_dt)
## Area under the curve: 0.764
plot(h_dt)

The area under the curve is 0.764 (below 0.8). The classification tree does not seem to be as good as logistic regression in this case.

KNN

# chooseK: fit a KNN classifier with the given k and report its accuracy on
# a validation set.
#
# Args:
#   k           number of neighbors to consider
#   train_set   training-set cases (numeric predictors only)
#   val_set     validation-set cases (same columns as train_set)
#   train_class class labels for the training cases
#   val_class   class labels for the validation cases
#
# Returns: a 1x2 matrix with columns `k` and `accuracy`.
chooseK = function(k, train_set, val_set, train_class, val_class){

  # Build knn with k neighbors considered.
  class_knn = knn(train = train_set,    #<- training set cases
                  test = val_set,       #<- validation set cases
                  cl = train_class,     #<- category for classification
                  k = k,                #<- number of neighbors considered
                  use.all = TRUE)       #<- on distance ties at the k-th
                                        #   neighbor, include all tied cases

  # Confusion table of predicted vs. actual classes.
  tab = table(class_knn, val_class)

  # Accuracy = proportion of cases on the diagonal (correct predictions).
  accu = sum(diag(tab)) / sum(tab)
  cbind(k = k, accuracy = accu)
}
loadPkg("class")

#test3
#train3
loadPkg("gmodels")
# KNN with k = 3 on the five scaled numeric predictor columns (1:5).
knn_profit <- knn(train = train3[c(1:5)], test = test3[c(1:5)], cl=train3$y, k=3)
CrossTable( knn_profit,test3$y, prop.chisq = FALSE) # predicted vs. actual cross table
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1049 
## 
##  
##              | test3$y 
##   knn_profit |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        92 |        85 |       177 | 
##              |     0.520 |     0.480 |     0.169 | 
##              |     0.354 |     0.108 |           | 
##              |     0.088 |     0.081 |           | 
## -------------|-----------|-----------|-----------|
##            1 |       168 |       704 |       872 | 
##              |     0.193 |     0.807 |     0.831 | 
##              |     0.646 |     0.892 |           | 
##              |     0.160 |     0.671 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       260 |       789 |      1049 | 
##              |     0.248 |     0.752 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
# Scan KNN accuracy over odd k from 1 to 21 using chooseK() defined above.
knn_k_profit = sapply(seq(1, 21, by = 2),  #<- odd values of k from 1 to 21
                         function(x) chooseK(x, 
                                             train_set = train3[c(1:5)],
                                             val_set = test3[c(1:5)],
                                             train_class = train3$y,
                                             val_class = test3$y))
# Reformat the 2 x 11 result matrix (row 1: k, row 2: accuracy) into a data frame.
#str(knn_different_k)
knn_k_profit = data.frame(k = knn_k_profit[1,],
                             accuracy = knn_k_profit[2,])

# Plot accuracy vs. k.

ggplot(knn_k_profit,
       aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3) +
  theme_bw()

  #theme(plot.title = element_text(hjust=0.5, size = 15, face = "bold.italic"))
# Refit KNN with k = 7, chosen from the accuracy-vs-k scan above.
knn_profit <- knn(train = train3[c(1:5)], test = test3[c(1:5)], cl=train3$y, k=7)

CrossTable( knn_profit,test3$y, prop.chisq = FALSE) # predicted vs. actual cross table
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1049 
## 
##  
##              | test3$y 
##   knn_profit |         0 |         1 | Row Total | 
## -------------|-----------|-----------|-----------|
##            0 |        90 |        66 |       156 | 
##              |     0.577 |     0.423 |     0.149 | 
##              |     0.346 |     0.084 |           | 
##              |     0.086 |     0.063 |           | 
## -------------|-----------|-----------|-----------|
##            1 |       170 |       723 |       893 | 
##              |     0.190 |     0.810 |     0.851 | 
##              |     0.654 |     0.916 |           | 
##              |     0.162 |     0.689 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       260 |       789 |      1049 | 
##              |     0.248 |     0.752 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
confusionMatrix(data=test3$y, reference = knn_profit)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0  90 170
##          1  66 723
##                                       
##                Accuracy : 0.775       
##                  95% CI : (0.749, 0.8)
##     No Information Rate : 0.851       
##     P-Value [Acc > NIR] : 1           
##                                       
##                   Kappa : 0.303       
##                                       
##  Mcnemar's Test P-Value : 2.02e-11    
##                                       
##             Sensitivity : 0.5769      
##             Specificity : 0.8096      
##          Pos Pred Value : 0.3462      
##          Neg Pred Value : 0.9163      
##              Prevalence : 0.1487      
##          Detection Rate : 0.0858      
##    Detection Prevalence : 0.2479      
##       Balanced Accuracy : 0.6933      
##                                       
##        'Positive' Class : 0           
## 

KNN model is the worst among three models: Logit, Classification Tree and KNN.

P/s: We can try with Random Forest Model as well and it would have the best results, but our project is too long so I do not cover RF here.

Conclusion

In the prediction of Profitability, Logistic Regression gives the best predictions in our case.

Chapter 7. KNN Models

We will use KNN model in this part to try for some predictions. (P/s: We will discuss later to see if we should include it in the final summary).

#loadPkg("class")

# We use the same train and test sets as in the linear model section.

#train2 <- as.data.frame(scale(movie_num, center = TRUE, scale = TRUE))
#train2$genres <- movie[movie_sample==1, 1:14]$genres

# Train set: duplicate the training data with all predictors created above,
# then standardize the revenue column (the other numeric columns were
# scaled earlier). Numeric columns are 1:6; columns 7:9 are the responses.
train2 <- train1_full
rev_center <- mean(train2$revenue) # training-set scaling parameters,
rev_scale <- sd(train2$revenue)    # reused for the test set below
train2$revenue <- scale(train2$revenue, center = rev_center, scale = rev_scale)

# Test set: FIX(review) — scale revenue with the TRAINING mean/sd, not the
# test set's own statistics. Scaling each split independently puts the two
# sets on slightly different scales and leaks test-set information.
test2 <- test1_full
test2$revenue <- scale(test2$revenue, center = rev_center, scale = rev_scale)


# Sanity check: the train set now has 6 scaled numeric predictors (cols 1:6)
# and 3 factor responses (genres, company, season).
str(train2)
## 'data.frame':    2176 obs. of  9 variables:
##  $ revenue   : num [1:2176, 1] 14.191 4.046 0.873 4.1 6.837 ...
##   ..- attr(*, "scaled:center")= num 1.2e+08
##   ..- attr(*, "scaled:scale")= num 1.88e+08
##  $ budget    : num  4.42 4.6 4.94 4.89 5.39 ...
##  $ popularity: num  3.355 2.165 0.411 2.395 2.908 ...
##  $ runtime   : num  2.44 1.78 1.01 1.35 1.44 ...
##  $ score     : num  1.031 -0.0157 -0.2483 -0.4809 1.1474 ...
##  $ vote      : num  7.65 2.47 0.81 1.84 4.09 ...
##  $ genres    : Factor w/ 18 levels "Action","Adventure",..: 1 1 1 9 1 2 1 2 2 2 ...
##  $ company   : Factor w/ 6 levels "Others","Paramount Pictures",..: 1 3 5 3 5 6 6 6 5 5 ...
##  $ season    : Factor w/ 4 levels "Spring","Summer",..: 4 3 1 1 1 2 1 2 2 1 ...
# Same structure check for the test set.
str(test2)
## 'data.frame':    1049 obs. of  9 variables:
##  $ revenue   : num [1:1049, 1] 4.573 5.25 2.555 2.524 -0.191 ...
##   ..- attr(*, "scaled:center")= num 1.24e+08
##   ..- attr(*, "scaled:scale")= num 1.83e+08
##  $ budget    : num  5.84 4.71 4.94 3.59 4.83 ...
##  $ popularity: num  3.041 2.301 0.542 2.18 0.552 ...
##  $ runtime   : num  2.779 2.588 -0.511 -0.225 1.825 ...
##  $ score     : num  0.682 1.496 1.264 -0.248 -0.481 ...
##  $ vote      : num  2.489 5.745 1.662 1.404 0.942 ...
##  $ genres    : Factor w/ 18 levels "Action","Adventure",..: 2 1 3 2 1 1 1 7 16 2 ...
##  $ company   : Factor w/ 6 levels "Others","Paramount Pictures",..: 5 1 5 1 5 1 1 2 4 1 ...
##  $ season    : Factor w/ 4 levels "Spring","Summer",..: 1 2 3 3 2 2 1 3 1 1 ...

Season

set.seed(123) # set seed to make sure the table won't change after we re-run the code
# Predict release season from the 6 scaled numeric features (cols 1:6) with k = 9.
season_knn <- knn(train = train2[,c(1:6)], test = test2[,c(1:6)], cl=train2$season, k=9)


# Cross-tabulate actual vs predicted season (chi-square contributions suppressed).
crosstab <- CrossTable(test2$season, season_knn, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1049 
## 
##  
##              | season_knn 
## test2$season |    Spring |    Summer |      Fall |    Winter | Row Total | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##       Spring |        40 |        63 |        64 |        47 |       214 | 
##              |     0.187 |     0.294 |     0.299 |     0.220 |     0.204 | 
##              |     0.189 |     0.243 |     0.190 |     0.195 |           | 
##              |     0.038 |     0.060 |     0.061 |     0.045 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##       Summer |        75 |        84 |        63 |        59 |       281 | 
##              |     0.267 |     0.299 |     0.224 |     0.210 |     0.268 | 
##              |     0.354 |     0.324 |     0.187 |     0.245 |           | 
##              |     0.071 |     0.080 |     0.060 |     0.056 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##         Fall |        49 |        72 |       125 |        81 |       327 | 
##              |     0.150 |     0.220 |     0.382 |     0.248 |     0.312 | 
##              |     0.231 |     0.278 |     0.371 |     0.336 |           | 
##              |     0.047 |     0.069 |     0.119 |     0.077 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##       Winter |        48 |        40 |        85 |        54 |       227 | 
##              |     0.211 |     0.176 |     0.374 |     0.238 |     0.216 | 
##              |     0.226 |     0.154 |     0.252 |     0.224 |           | 
##              |     0.046 |     0.038 |     0.081 |     0.051 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total |       212 |       259 |       337 |       241 |      1049 | 
##              |     0.202 |     0.247 |     0.321 |     0.230 |           | 
## -------------|-----------|-----------|-----------|-----------|-----------|
## 
## 
set.seed(123)
# Tune k for the season classifier using the chooseK() helper defined earlier
# (returns c(k, accuracy) for each k).
knn_season_k = sapply(seq(1, 41, by = 2),  #<- set k to be odd numbers from 1 to 41
                         function(x) chooseK(x, 
                                             train_set = train2[c(1:6)],
                                             val_set = test2[c(1:6)],
                                             train_class = train2$season,
                                             val_class = test2$season))

# Reshape the 2-row sapply result into a data frame of (k, accuracy) pairs.
knn_season_k = data.frame(k = knn_season_k[1,],
                             accuracy = knn_season_k[2,])

# Plot accuracy vs. k.
ggplot(knn_season_k,
       aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3)

k = 35 gives the best accuracy.

Genres

There are many genres so we won’t show the cross table

set.seed(123)
# Tune k for the genre classifier (same grid and helper as for season).
knn_genres_k = sapply(seq(1, 41, by = 2),  #<- set k to be odd numbers from 1 to 41
                         function(x) chooseK(x, 
                                             train_set = train2[c(1:6)],
                                             val_set = test2[c(1:6)],
                                             train_class = train2$genres,
                                             val_class = test2$genres))

# Reshape the 2-row sapply result into a data frame of (k, accuracy) pairs.
knn_genres_k = data.frame(k = knn_genres_k[1,],
                             accuracy = knn_genres_k[2,])

# Plot accuracy vs. k.
ggplot(knn_genres_k,
       aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3)

k = 27 gives the best accuracy

Company

set.seed(123)
# Predict production company from the 6 scaled numeric features (cols 1:6) with k = 9.
company_knn <- knn(train = train2[,c(1:6)], test = test2[,c(1:6)], cl=train2$company, k=9)
# Cross-tabulate actual vs predicted company (chi-square contributions suppressed).
crosstab <- CrossTable(test2$company, company_knn, prop.chisq = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1049 
## 
##  
##                    | company_knn 
##      test2$company |             Others | Paramount Pictures |      Sony Pictures | Universal Pictures |        Walt Disney |        Warner Bros |          Row Total | 
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##             Others |                447 |                 10 |                 10 |                 17 |                 48 |                  2 |                534 | 
##                    |              0.837 |              0.019 |              0.019 |              0.032 |              0.090 |              0.004 |              0.509 | 
##                    |              0.548 |              0.370 |              0.312 |              0.395 |              0.381 |              0.400 |                    | 
##                    |              0.426 |              0.010 |              0.010 |              0.016 |              0.046 |              0.002 |                    | 
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Paramount Pictures |                 67 |                  2 |                  1 |                  2 |                 14 |                  0 |                 86 | 
##                    |              0.779 |              0.023 |              0.012 |              0.023 |              0.163 |              0.000 |              0.082 | 
##                    |              0.082 |              0.074 |              0.031 |              0.047 |              0.111 |              0.000 |                    | 
##                    |              0.064 |              0.002 |              0.001 |              0.002 |              0.013 |              0.000 |                    | 
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##      Sony Pictures |                 62 |                  5 |                  3 |                  9 |                  9 |                  0 |                 88 | 
##                    |              0.705 |              0.057 |              0.034 |              0.102 |              0.102 |              0.000 |              0.084 | 
##                    |              0.076 |              0.185 |              0.094 |              0.209 |              0.071 |              0.000 |                    | 
##                    |              0.059 |              0.005 |              0.003 |              0.009 |              0.009 |              0.000 |                    | 
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## Universal Pictures |                 81 |                  3 |                  9 |                  7 |                 17 |                  0 |                117 | 
##                    |              0.692 |              0.026 |              0.077 |              0.060 |              0.145 |              0.000 |              0.112 | 
##                    |              0.099 |              0.111 |              0.281 |              0.163 |              0.135 |              0.000 |                    | 
##                    |              0.077 |              0.003 |              0.009 |              0.007 |              0.016 |              0.000 |                    | 
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##        Walt Disney |                109 |                  7 |                  7 |                  7 |                 27 |                  2 |                159 | 
##                    |              0.686 |              0.044 |              0.044 |              0.044 |              0.170 |              0.013 |              0.152 | 
##                    |              0.134 |              0.259 |              0.219 |              0.163 |              0.214 |              0.400 |                    | 
##                    |              0.104 |              0.007 |              0.007 |              0.007 |              0.026 |              0.002 |                    | 
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##        Warner Bros |                 50 |                  0 |                  2 |                  1 |                 11 |                  1 |                 65 | 
##                    |              0.769 |              0.000 |              0.031 |              0.015 |              0.169 |              0.015 |              0.062 | 
##                    |              0.061 |              0.000 |              0.062 |              0.023 |              0.087 |              0.200 |                    | 
##                    |              0.048 |              0.000 |              0.002 |              0.001 |              0.010 |              0.001 |                    | 
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##       Column Total |                816 |                 27 |                 32 |                 43 |                126 |                  5 |               1049 | 
##                    |              0.778 |              0.026 |              0.031 |              0.041 |              0.120 |              0.005 |                    | 
## -------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## 
## 
set.seed(123)
# Tune k for the company classifier (same grid and helper as above).
knn_company_k = sapply(seq(1, 41, by = 2),  #<- set k to be odd numbers from 1 to 41
                         function(x) chooseK(x, 
                                             train_set = train2[c(1:6)],
                                             val_set = test2[c(1:6)],
                                             train_class = train2$company,
                                             val_class = test2$company))

# Reshape the 2-row sapply result into a data frame of (k, accuracy) pairs.
knn_company_k = data.frame(k = knn_company_k[1,],
                             accuracy = knn_company_k[2,])

# Plot accuracy vs. k.
ggplot(knn_company_k,
       aes(x = k, y = accuracy)) +
  geom_line(color = "orange", size = 1.5) +
  geom_point(size = 3)

k = 23 gives the best accuracy

Chapter 8. Time series

We use time series model in this part. We will try HoltWinters and ARIMA.

# we will subset our data to get the movies from 1995-2016 since we have consistent data in that range
# we use time series with frequency = 4 (by quarter)

movie_ts <- subset(movie, (year > 1994)) # subset data after 1994
movie_ts <- subset(movie_ts, select = c("revenue", "year", "quarter")) # select 3 columns we use for time series
movie_ts <- movie_ts[order(movie_ts$year),] # order by year
#unique(movie_ts$year)

#str(movie_ts)

loadPkg("dplyr")
# Total revenue for each quarter of each year.
# FIX: summarise_each()/funs() are deprecated and funs() is defunct in
# current dplyr; summarise(across(...)) is the supported replacement and
# produces the same per-group column sums.
movie_ts <- movie_ts %>%
  group_by(year, quarter) %>%
  summarise(across(everything(), sum), .groups = "drop")
movie_ts$quarter <- factor(movie_ts$quarter, levels = c("Q1", "Q2", "Q3", "Q4")) # reorder the factor levels so quarters sort Q1..Q4
movie_ts <- movie_ts[order(movie_ts$year, movie_ts$quarter),] # order by year then by quarter
#movie_ts
# divide the data into 2 sets: training and testing

movie.ts <- ts(movie_ts$revenue, frequency = 4, start = c(1995,1), end = c(2010,4)) # training data from 1995-2010

movie.ts.test <- ts(tail(movie_ts, 23)$revenue, frequency = 4, start = c(2011,1), end = c(2016,3)) # testing data: the last 23 quarters (2011 Q1 - 2016 Q3), used for evaluating our time series models

#movie.ts.test

Visualization

# Boxplots of total quarterly revenue, one box per quarter.
ggplot(movie_ts, aes(x = quarter, y = revenue, fill = quarter)) +
  geom_boxplot(fill = c("red", "dodgerblue", "yellow", "green"), alpha = 0.8) +
  ggtitle("Revenue distribution by quarter") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

Movies released in quarter 2 seem to perform well in box-office revenue.

# Training series: quarterly total revenue, 1995-2010.
plot(movie.ts, xlab = "Year", ylab="Revenue")

Trend

# Classical additive decomposition into trend, seasonal and random components.
movie_dcp <- decompose(movie.ts)
summary(movie_dcp$seasonal) # size of the quarterly seasonal effect
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.55e+09 -6.35e+08  2.92e+08  0.00e+00  9.27e+08  9.67e+08
summary(movie_dcp$random) # remainder after removing trend and seasonality
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -1.05e+09 -4.67e+08 -8.91e+07 -3.50e+06  4.29e+08  1.99e+09         4
# Plot observed, trend, seasonal and random components together.
plot(movie_dcp)

HoltWinters

# Fit Holt-Winters exponential smoothing: level + trend + additive seasonality,
# with smoothing parameters (alpha, beta, gamma) estimated from the data.
movie.ts.smth <- HoltWinters(movie.ts)
movie.ts.smth
## Holt-Winters exponential smoothing with trend and additive seasonal component.
## 
## Call:
## HoltWinters(x = movie.ts)
## 
## Smoothing parameters:
##  alpha: 0.0475
##  beta : 0.154
##  gamma: 0.313
## 
## Coefficients:
##         [,1]
## a   4.70e+09
## b   5.96e+07
## s1 -1.15e+09
## s2  1.73e+09
## s3  3.53e+08
## s4  1.03e+09
# In-sample sum of squared errors of the Holt-Winters fit.
movie.ts.smth$SSE
## [1] 4.17e+19

Forecasting

# Observed series with the Holt-Winters fitted values overlaid.
plot(movie.ts.smth)

Prediction on the future

loadPkg("forecast")

#accuracy(movie_hw) 
# Forecast 23 quarters ahead (2011 Q1 - 2016 Q3) to match the test series.
movie_ft <- forecast(movie.ts.smth, h=23)
plot(movie_ft)

#sm <- ma(movie.ts, order=4) # 4 quarters moving average
#lines(sm, col="red") # plot

Prediction

We predict the model on the testing data

# Overlay the held-out actual values (red) on the Holt-Winters forecast fan chart.
plot(movie_ft)
lines(movie.ts.test, col = "red")

#ggplot(movie_hw)
legend("topleft", lty = 1, bty = "n",
       col = c("red", "blue"), legend = c("actual", "predicted"))

Better visualization with highcharter

loadPkg("highcharter")


# Interactive chart: HW forecast (with intervals) plus the actual test series in red.
hchart(movie_ft) %>% 
  hc_add_series_ts(movie.ts.test, color = "red", name = "ACTUAL") # plot the predicted and actual values in testing data, and the values in training data
# we can have a more detailed plot with only predicted and actual values in testing data
highchart(type="stock") %>%        # try type = "chart", "stock" for different visualizations
  hc_chart(type = "line") %>%      # try type = "line", "bar", "column" for different visualizations
  hc_add_series_ts(movie.ts.test,name = "ACTUAL", color = "red" ) %>%   # plot the actual data labeled by red color
  hc_add_series(name = "PREDICTED", movie_ft, color = "blue" )  %>%     # plot the hw predictions labeled by blue color
  hc_legend(align = "left", verticalAlign = "top",
            layout = "vertical", x = 0, y = 100)

Nice prediction since the actual values are all in the 95% confidence interval.

ARIMA Model

Forecasting

# Fit an ARIMA model with automatic order selection, then forecast
# 23 quarters ahead (2011 Q1 - 2016 Q3) to match the test series.
movie_arima <- auto.arima(movie.ts) # create arima model
movie_arima <- forecast(movie_arima, h=23) # note: this overwrites the fitted model object with its forecast
plot(movie_arima)

#movie_arima

Model Evaluation

# Overlay the held-out actual values (red) on the ARIMA forecast fan chart.
plot(movie_arima)
lines(movie.ts.test, col = "red")

legend("topleft", lty = 1, bty = "n",
       col = c("red", "blue"), legend = c("actual", "predicted"))

We can also plot nicer with highcharter

# Same interactive comparison for the ARIMA forecast.
hchart(movie_arima) %>% 
  hc_add_series_ts(movie.ts.test, color = "red", name = "ACTUAL")  # general graph
# more detailed graph, trying different chart types here
highchart(type="chart") %>%        # try type = "chart", "stock" for different visualizations
  hc_chart(type = "column") %>%      # try type = "line", "bar", "column" for different visualizations
  hc_add_series_ts(movie.ts.test,name = "ACTUAL", color = "red" ) %>%   # plot the actual data labeled by red color
  hc_add_series(name = "PREDICTED", movie_arima, color = "blue" )  %>%     # plot the ARIMA predictions labeled by blue color
  hc_legend(align = "left", verticalAlign = "top",
            layout = "vertical", x = 0, y = 100)

In ARIMA model, some actual values are out of 95% confidence interval.

ARIMA vs HoltWinter

# Out-of-sample comparison of the two models against the 2011 Q1 - 2016 Q3
# test series: Holt-Winters (movie_ft) vs ARIMA (movie_arima).
accuracy(movie_ft, movie.ts.test)
##                     ME     RMSE      MAE    MPE MAPE  MASE   ACF1
## Training set  1.09e+08 8.33e+08 6.51e+08  -1.23 22.9 0.812 -0.156
## Test set     -3.03e+08 1.32e+09 9.99e+08 -13.25 22.2 1.245 -0.231
##              Theil's U
## Training set        NA
## Test set          0.35
accuracy(movie_arima, movie.ts.test)
##                     ME     RMSE      MAE   MPE MAPE  MASE    ACF1
## Training set  2.25e+07 7.06e+08 5.23e+08  -4.1 17.7 0.652 -0.0119
## Test set     -3.20e+08 1.31e+09 9.88e+08 -13.7 22.2 1.232 -0.2368
##              Theil's U
## Training set        NA
## Test set         0.352

ARIMA does better on the training set, but the performances of the two models on the testing set are not much different. In my opinion, HoltWinters is better, as its actual values always fall within the 95% confidence interval, and the gap between its training and testing performance is smaller. ARIMA may be overfitting (low error on the training set but noticeably higher error on the testing set).